<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<!--
  RSS 2.0 feed for the /tags/text/ listing of the "Saleem Ansari" site.
  The generator element below identifies Hugo as the producer, so this file
  is build output; lasting changes presumably belong in the site content or
  templates rather than here (TODO confirm).
  The atom prefix is declared only for the self-referencing atom:link element
  inside the channel.
-->
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
  <channel>
    <!-- Channel metadata: describes the feed as a whole, not an individual post. -->
    <title>Text on Saleem Ansari</title>
    <!-- NOTE(review): this link is site-relative (no scheme or host). Feed
         readers generally expect an absolute URL here; presumably the site
         base URL was empty at build time. Confirm against the site config. -->
    <link>/tags/text/</link>
    <description>Recent content in Text on Saleem Ansari</description>
    <generator>Hugo -- gohugo.io</generator>
    <language>en</language>
    <copyright>(c) 2024 Saleem Ansari</copyright>
    <!-- Same timestamp as the pubDate of the only item in this feed. -->
    <lastBuildDate>Sun, 20 Dec 2015 04:12:29 +0530</lastBuildDate>
    <!-- Self link: points at this feed document. The href is also
         site-relative, so the same absolute-URL caveat as the channel
         link applies. -->
    <atom:link href="/tags/text/index.xml" rel="self" type="application/rss+xml" />
    <!-- The single entry carried by this feed. -->
    <item>
      <title>Extract Text From PDF</title>
      <link>/2015/12/20/extract-text-from-pdf/</link>
      <pubDate>Sun, 20 Dec 2015 04:12:29 +0530</pubDate>
      <!-- The guid repeats the item link and carries no isPermaLink attribute.
           NOTE(review): RSS 2.0 treats a guid without that attribute as a
           permalink URL, which a site-relative path may not satisfy. Verify. -->
      <guid>/2015/12/20/extract-text-from-pdf/</guid>
      <!-- Plain-text summary of the post.
           * Each &#xA; is a numeric character reference for a line feed.
           * &amp;quot; reaches consumers as the literal text &quot; after XML
             parsing; consumers that treat the description as HTML will then
             show a double quote.
           * NOTE(review): "story.pdf the you can" looks like a typo for
             "then you can" in the source post; fix it there, not in this file.
           * NOTE(review): the text stops mid-command at "$f $f.", so the
             summary appears to be truncated by the generator. TODO confirm. -->
      <description>Extract Text from Images in multi-page PDF To extract text from PDF, you would need two software installed on your machine.&#xA;ghostscript&#xA;tesseract OCR&#xA;Installing these on Fedora is very easy:&#xA;$ sudo yum install -y ghostscript tesseract Now if your PDF file is named story.pdf the you can extract text as follows:&#xA;$ ghostscript -dNOPAUSE -dBATCH -sDEVICE=pngalpha -r300 -sOutputFile=&amp;quot;page%03d&amp;quot;.png story.pdf $ for f in page*.png ; do tesseract $f $f.</description>
    </item>
  </channel>
</rss>
