diff options
Diffstat (limited to 'app')
-rw-r--r-- | app/Main.hs | 172 |
1 files changed, 156 insertions, 16 deletions
diff --git a/app/Main.hs b/app/Main.hs index 075414a..658b28e 100644 --- a/app/Main.hs +++ b/app/Main.hs @@ -1,45 +1,182 @@ +{-# LANGUAGE BlockArguments #-} +{-# LANGUAGE OverloadedRecordDot #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE RecordWildCards #-} + module Main where -import Control.Exception (Exception, throwIO) +import Control.Arrow (second) +import Control.Exception (Exception, throw, throwIO) +import Control.Monad (join, when) +import Data.Attoparsec.Text qualified as A import Data.ByteString.Lazy qualified as LB import Data.List +import Data.Map qualified as M +import Data.Maybe (fromMaybe) import Data.String (IsString (fromString)) +import Data.Text qualified as T +import Data.Text.Encoding qualified as T +import Debug.Trace import System.Directory import System.FilePath import System.IO.Temp (withSystemTempDirectory) import System.Process.Typed import Text.Printf (printf) +import Text.Read (readMaybe) main :: IO () main = do let input = "0000001.pdf" + ocr input + +debug :: Show a => String -> a -> a +debug s x = + trace (printf "%s: %s\n" s (show x)) x +ocr :: FilePath -> IO () +ocr input = do withSystemTempDirectory input $ \tmp -> do - sh_ (printf "pdftoppm '%s' '%s' -png -r 300" input (tmp </> input)) - imageInputs <- sort <$> listDirectory tmp - outputs <- - mapM - ( \imageInput -> do + let fn suffix = tmp </> takeBaseName input <> suffix + pdfInfo <- parsePdfInfo <$> sh (printf "pdfinfo '%s'" input) + pdfImages <- parsePdfImages <$> sh (printf "pdfimages -list '%s'" input) + hasText <- + (not . T.null) . T.strip . T.decodeUtf8 . LB.toStrict + <$> sh (printf "pdftotext '%s' -" input) + when (not hasText) do + let isScan = + length pdfImages == pdfInfo.numPages + && all ((pdfInfo.pageSize ==) . imageSize) pdfImages + if isScan + then sh_ (printf "pdfimages '%s' '%s' -tiff" input (fn "")) + else sh_ (printf "pdftoppm '%s' '%s' -r 300 -tiff" input (fn "-%d.pdf")) + imageFiles <- sort <$> listDirectory tmp + -- XXX add DPI information to image so that resulting pdf preserves DPI + mapM_ + ( \(pdfImage, imageFile) -> sh_ ( printf - "tesseract '%s' '%s' pdf -psm 1 -oem 1" - (tmp </> imageInput) - (tmp </> imageInput) + "convert -density %dx%d -units PixelsPerInch '%s' '%s'" + pdfImage.xPpi + pdfImage.yPpi + (tmp </> imageFile) + (tmp </> imageFile) ) - pure (imageInput <.> ".pdf") ) - imageInputs - sh_ ("pdfunite " ++ intercalate " " (map (printf "'%s'" . (tmp </>)) outputs ++ [printf "'%s'" (tmp </> input)])) - copyFile input (input <.> "bak") - copyFile (tmp </> input) ("." <> input) - renameFile ("." <> input) input - LB.putStr =<< sh (printf "pdftotext '%s' -" input) + (zip pdfImages imageFiles) + pdfFiles <- mapM (ocr1 tmp . (tmp </>)) imageFiles + sh_ ("pdfunite " ++ intercalate " " (map (printf "'%s'" . (tmp </>)) pdfFiles ++ [printf "'%s'" (tmp </> input)])) + copyFile input (input <.> "bak") + copyFile (tmp </> input) ("." <> input) + renameFile ("." <> input) input + +ocr1 :: FilePath -> FilePath -> IO FilePath +ocr1 tmp input = do + sh_ + ( printf + "tesseract '%s' '%s' pdf" + (tmp </> input) + (tmp </> takeBaseName input) + ) + pure (takeBaseName input <.> "pdf") + +data PdfInfo = PdfInfo + { numPages :: Int, + pageSize :: (Double, Double) + } + deriving (Show) + +data PdfInfoException = PdfInfoException + deriving (Show) + +instance Exception PdfInfoException + +parsePdfInfo :: LB.ByteString -> PdfInfo +parsePdfInfo out' = + fromMaybe (throw PdfInfoException) $ do + numPages <- readMaybe . T.unpack =<< M.lookup "Pages" kvs + pageSize <- + rightToMaybe . A.parseOnly pageSizeParser + =<< M.lookup "Page size" kvs + pure PdfInfo {..} + where + out = T.decodeUtf8 (LB.toStrict out') + kvs = + M.fromList + . map (second T.stripStart) + . map (second T.tail . T.break (== ':')) + . filter (not . T.null) + . T.lines + $ out + pageSizeParser = + (,) + <$> (A.double <* A.string " x ") + <*> (A.double <* A.string " pts (A4)") + <* A.endOfInput + +type PdfImages = [PdfImage] + +data PdfImage = PdfImage + { page :: Int, + num :: Int, + type_ :: String, + width :: Int, + height :: Int, + color :: String, + comp :: Int, + bpc :: Int, + enc :: String, + interp :: String, + object :: Int, + id :: Int, + xPpi :: Int, + yPpi :: Int, + size :: String, + ratio :: String + } + deriving (Show) + +imageSize :: PdfImage -> (Double, Double) +imageSize (PdfImage {..}) = + let f ppi p = 72 * fromIntegral p / fromIntegral ppi + in (f xPpi width, f yPpi height) + +data PdfImagesException = PdfImagesException + deriving (Show) + +instance Exception PdfImagesException data ProcessException = ProcessException Int LB.ByteString deriving (Show) instance Exception ProcessException +parsePdfImages :: LB.ByteString -> PdfImages +parsePdfImages out' = + map + ( \(page' : num' : type_ : width' : height' : color : comp' : bpc' : enc : interp : object' : id' : xPpi' : yPpi' : size : ratio : []) -> + PdfImage + { page = read page', + num = read num', + width = read width', + height = read height', + comp = read comp', + bpc = read bpc', + object = read object', + id = read id', + xPpi = read xPpi', + yPpi = read yPpi', + .. + } + ) + . map (map T.unpack) + . map T.words + . drop 2 + . filter (not . T.null) + . T.lines + $ out + where + out = T.decodeUtf8 (LB.toStrict out') + sh :: String -> IO LB.ByteString sh cmd = do -- printf "+ %s\n" cmd @@ -50,3 +187,6 @@ sh cmd = do sh_ :: String -> IO () sh_ = fmap (\_ -> ()) . sh + +rightToMaybe :: Either e a -> Maybe a +rightToMaybe = either (const Nothing) Just |