#!/usr/bin/env python
# -*- coding: utf-8 -*-
import time
import signal
import logging
import re
import os
import sys
import optparse
import csv
import fnmatch
import subprocess
# ===========================================================================================================
# Default configuration
JPG_OUTPUT_DIR = 'jpg' # Default Text files output directory
JPG_RESOLUTION_DPI = 400 # Default Text files output directory
TEXT_OUTPUT_DIR = 'text' # Default Text files output directory
RESUME_OCR = False
# :\Program Files\gs\gs9.07\
if os.name == 'nt':
GS_PROG = r"path_to_gswin32c.exe"
TESSERACT_PROG = r"path_to_tesseract.exe"
else:
GS_PROG = '/usr/bin/gs'
TESSERACT_PROG = '/usr/bin/tesseract'
# ===========================================================================================================
logger = logging. getLogger ('pdf2txt')
logger. addHandler (logging. StreamHandler ())
logger. setLevel (logging. DEBUG)
usage = "Usage: %prog [options] <pdf directory>"
def parse_command_line(argv):
"""Command line options parser
"""
parser = optparse.OptionParser(add_help_option=True, usage=usage)
parser.add_option("-d", "--dpi", action="store",
type="int", dest="dpi", default=JPG_RESOLUTION_DPI,
help="JPEG Resolution in DPI (default: {0:d})".format((JPG_RESOLUTION_DPI)))
parser.add_option("-j", "--jpgdir", action="store",
type="string", dest="jpgdir", default=JPG_OUTPUT_DIR,
help="JPEG output directory (default: {0!s})".format((JPG_OUTPUT_DIR)))
parser.add_option("-t", "--textdir", action="store",
type="string", dest="txtdir", default=TEXT_OUTPUT_DIR,
help="Text output directory (default: {0!s})".format((TEXT_OUTPUT_DIR)))
parser.add_option("-r", "--resume", action="store_true",
dest="resume", default=RESUME_OCR,
help="Resume OCR to Text (default: {0!s})".format((RESUME_OCR)))
return parser.parse_args(argv)
def getSize(filename):
"""Returns file size
"""
try:
return os.path.getsize(filename)
except:
return 0
def jpg_to_text(options, filename, rootdir):
"""OCR JPEG files and save as TEXT files
"""
try:
relpath = os.path.relpath(filename)
except:
relpath = os.path.splitdrive(filename)[1]
relpath = re.sub('^' + re.escape(options.jpgdir), '', relpath, flags=re.I)
relpath = re.sub(r'^[\.|\\|\/]*', '', relpath)
extdir = rootdir + '/' + os.path.dirname(relpath)
fname = extdir + '/' + os.path.basename(relpath)
fname = os.path.splitext(fname)[0]
print("OCR JPG to TEXT: {0!s}".format((filename)))
try:
if not os.path.exists(extdir):
os.makedirs(extdir)
if getSize(fname + ".txt") == 0 or not options.resume:
p = subprocess.Popen([TESSERACT_PROG, filename, fname], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = p.communicate()
return out, err
else:
return "Resume, file exist skipped", ""
except:
pass
return "", "ERROR"
def pdf_to_jpg(filename, rootdir, dpi):
"""Convert PDF files to JPEG files
"""
try:
relpath = os.path.relpath(filename)
except:
relpath = os.path.splitdrive(filename)[1]
relpath = re.sub(r'^[\.|\\|\/]*', '', relpath)
extdir = rootdir + '/' + os.path.dirname(relpath)
fname = extdir + '/' + os.path.basename(relpath)
fname = os.path.splitext(fname)[0] + '-%d.jpg'
print("Convert PDF to JPG: {0!s}".format((filename)))
try:
if not os.path.exists(extdir):
os.makedirs(extdir)
#gswin32c -dNOPAUSE -r150 -sDEVICE=jpeg -dBATCH -sOutputFile=out-%d.jpg
p = subprocess.Popen([GS_PROG, "-dNOPAUSE", "-r" + str(dpi), "-sDEVICE=jpeg", "-dBATCH", "-sOutputFile=" + fname, filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = p.communicate()
return out, err
except:
print("ERROR")
return None, None
def main(options, args):
""" Main Entry Point
"""
rootdir = args[1]
if not options.resume:
# For each PDF files in folder and sub-folders
for root, dirnames, filenames in os.walk(rootdir):
for filename in fnmatch.filter(filenames, '*.pdf'):
fname = os.path.join(root, filename)
out, err = pdf_to_jpg(fname, options.jpgdir, options.dpi)
print out, err
# For each JPG files in folder and sub-folders
for root, dirnames, filenames in os.walk(options.jpgdir):
for filename in fnmatch.filter(filenames, '*.jpg'):
fname = os.path.join(root, filename)
out, err = jpg_to_text(options, fname, options.txtdir)
print (out, err)
def signal_handler(signal, frame):
print 'You pressed Ctrl+C!'
os._exit(1)
if __name__ == "__main__":
reload(sys)
sys.setdefaultencoding('utf-8')
signal.signal(signal.SIGINT, signal_handler)
print("{0!s} - r2 (2013/06/15)\n".format((os.path.basename(sys.argv[0]))))
(options, args) = parse_command_line(sys.argv)
if len(args) < 2:
print("Please specify root directory of PDF input files (-h/--help for help)")
sys.exit(-1)
main(options, args)