summaryrefslogtreecommitdiffstats
path: root/app
diff options
context:
space:
mode:
Diffstat (limited to 'app')
-rw-r--r--app/Main.hs172
1 files changed, 156 insertions, 16 deletions
diff --git a/app/Main.hs b/app/Main.hs
index 075414a..658b28e 100644
--- a/app/Main.hs
+++ b/app/Main.hs
@@ -1,45 +1,182 @@
+{-# LANGUAGE BlockArguments #-}
+{-# LANGUAGE OverloadedRecordDot #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE RecordWildCards #-}
+
module Main where
-import Control.Exception (Exception, throwIO)
+import Control.Arrow (second)
+import Control.Exception (Exception, throw, throwIO)
+import Control.Monad (join, when)
+import Data.Attoparsec.Text qualified as A
import Data.ByteString.Lazy qualified as LB
import Data.List
+import Data.Map qualified as M
+import Data.Maybe (fromMaybe)
import Data.String (IsString (fromString))
+import Data.Text qualified as T
+import Data.Text.Encoding qualified as T
+import Debug.Trace
import System.Directory
import System.FilePath
import System.IO.Temp (withSystemTempDirectory)
import System.Process.Typed
import Text.Printf (printf)
+import Text.Read (readMaybe)
main :: IO ()
main = do
let input = "0000001.pdf"
+ ocr input
+
+debug :: Show a => String -> a -> a
+debug s x =
+ trace (printf "%s: %s\n" s (show x)) x
+ocr :: FilePath -> IO ()
+ocr input = do
withSystemTempDirectory input $ \tmp -> do
- sh_ (printf "pdftoppm '%s' '%s' -png -r 300" input (tmp </> input))
- imageInputs <- sort <$> listDirectory tmp
- outputs <-
- mapM
- ( \imageInput -> do
+ let fn suffix = tmp </> takeBaseName input <> suffix
+ pdfInfo <- parsePdfInfo <$> sh (printf "pdfinfo '%s'" input)
+ pdfImages <- parsePdfImages <$> sh (printf "pdfimages -list '%s'" input)
+ hasText <-
+ (not . T.null) . T.strip . T.decodeUtf8 . LB.toStrict
+ <$> sh (printf "pdftotext '%s' -" input)
+ when (not hasText) do
+ let isScan =
+ length pdfImages == pdfInfo.numPages
+ && all ((pdfInfo.pageSize ==) . imageSize) pdfImages
+ if isScan
+ then sh_ (printf "pdfimages '%s' '%s' -tiff" input (fn ""))
+ else sh_ (printf "pdftoppm '%s' '%s' -r 300 -tiff" input (fn "-%d.pdf"))
+ imageFiles <- sort <$> listDirectory tmp
+ -- XXX add DPI information to image so that resulting pdf preserves DPI
+ mapM_
+ ( \(pdfImage, imageFile) ->
sh_
( printf
- "tesseract '%s' '%s' pdf -psm 1 -oem 1"
- (tmp </> imageInput)
- (tmp </> imageInput)
+ "convert -density %dx%d -units PixelsPerInch '%s' '%s'"
+ pdfImage.xPpi
+ pdfImage.yPpi
+ (tmp </> imageFile)
+ (tmp </> imageFile)
)
- pure (imageInput <.> ".pdf")
)
- imageInputs
- sh_ ("pdfunite " ++ intercalate " " (map (printf "'%s'" . (tmp </>)) outputs ++ [printf "'%s'" (tmp </> input)]))
- copyFile input (input <.> "bak")
- copyFile (tmp </> input) ("." <> input)
- renameFile ("." <> input) input
- LB.putStr =<< sh (printf "pdftotext '%s' -" input)
+ (zip pdfImages imageFiles)
+ pdfFiles <- mapM (ocr1 tmp . (tmp </>)) imageFiles
+ sh_ ("pdfunite " ++ intercalate " " (map (printf "'%s'" . (tmp </>)) pdfFiles ++ [printf "'%s'" (tmp </> input)]))
+ copyFile input (input <.> "bak")
+ copyFile (tmp </> input) ("." <> input)
+ renameFile ("." <> input) input
+
+ocr1 :: FilePath -> FilePath -> IO FilePath
+ocr1 tmp input = do
+ sh_
+ ( printf
+ "tesseract '%s' '%s' pdf"
+ (tmp </> input)
+ (tmp </> takeBaseName input)
+ )
+ pure (takeBaseName input <.> "pdf")
+
+data PdfInfo = PdfInfo
+ { numPages :: Int,
+ pageSize :: (Double, Double)
+ }
+ deriving (Show)
+
+data PdfInfoException = PdfInfoException
+ deriving (Show)
+
+instance Exception PdfInfoException
+
+parsePdfInfo :: LB.ByteString -> PdfInfo
+parsePdfInfo out' =
+ fromMaybe (throw PdfInfoException) $ do
+ numPages <- readMaybe . T.unpack =<< M.lookup "Pages" kvs
+ pageSize <-
+ rightToMaybe . A.parseOnly pageSizeParser
+ =<< M.lookup "Page size" kvs
+ pure PdfInfo {..}
+ where
+ out = T.decodeUtf8 (LB.toStrict out')
+ kvs =
+ M.fromList
+ . map (second T.stripStart)
+ . map (second T.tail . T.break (== ':'))
+ . filter (not . T.null)
+ . T.lines
+ $ out
+ pageSizeParser =
+ (,)
+ <$> (A.double <* A.string " x ")
+ <*> (A.double <* A.string " pts (A4)")
+ <* A.endOfInput
+
+type PdfImages = [PdfImage]
+
+data PdfImage = PdfImage
+ { page :: Int,
+ num :: Int,
+ type_ :: String,
+ width :: Int,
+ height :: Int,
+ color :: String,
+ comp :: Int,
+ bpc :: Int,
+ enc :: String,
+ interp :: String,
+ object :: Int,
+ id :: Int,
+ xPpi :: Int,
+ yPpi :: Int,
+ size :: String,
+ ratio :: String
+ }
+ deriving (Show)
+
+imageSize :: PdfImage -> (Double, Double)
+imageSize (PdfImage {..}) =
+ let f ppi p = 72 * fromIntegral p / fromIntegral ppi
+ in (f xPpi width, f yPpi height)
+
+data PdfImagesException = PdfImagesException
+ deriving (Show)
+
+instance Exception PdfImagesException
data ProcessException = ProcessException Int LB.ByteString
deriving (Show)
instance Exception ProcessException
+parsePdfImages :: LB.ByteString -> PdfImages
+parsePdfImages out' =
+ map
+ ( \(page' : num' : type_ : width' : height' : color : comp' : bpc' : enc : interp : object' : id' : xPpi' : yPpi' : size : ratio : []) ->
+ PdfImage
+ { page = read page',
+ num = read num',
+ width = read width',
+ height = read height',
+ comp = read comp',
+ bpc = read bpc',
+ object = read object',
+ id = read id',
+ xPpi = read xPpi',
+ yPpi = read yPpi',
+ ..
+ }
+ )
+ . map (map T.unpack)
+ . map T.words
+ . drop 2
+ . filter (not . T.null)
+ . T.lines
+ $ out
+ where
+ out = T.decodeUtf8 (LB.toStrict out')
+
sh :: String -> IO LB.ByteString
sh cmd = do
-- printf "+ %s\n" cmd
@@ -50,3 +187,6 @@ sh cmd = do
sh_ :: String -> IO ()
sh_ = fmap (\_ -> ()) . sh
+
+rightToMaybe :: Either e a -> Maybe a
+rightToMaybe = either (const Nothing) Just