{-# LANGUAGE BlockArguments #-} {-# LANGUAGE OverloadedRecordDot #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RecordWildCards #-} module Main where import Control.Arrow (second) import Control.Concurrent.ParallelIO.Local (parallel, withPool) import Control.Exception (Exception, throw, throwIO) import Control.Monad (join, when) import Data.Attoparsec.Text qualified as A import Data.ByteString.Lazy qualified as LB import Data.List import Data.Map qualified as M import Data.Maybe (fromMaybe) import Data.String (IsString (fromString)) import Data.Text qualified as T import Data.Text.Encoding qualified as T import Debug.Trace import GHC.Conc (getNumProcessors) import System.Directory import System.FilePath import System.IO.Temp (withSystemTempDirectory) import System.Process.Typed import Text.Printf (printf) import Text.Read (readMaybe) main :: IO () main = parMapM_ ocr =<< sort . map ("originals" ) . filter (not . (".bak" `isSuffixOf`)) <$> listDirectory "originals" debug :: Show a => String -> a -> a debug s x = trace (printf "%s: %s\n" s (show x)) x ocr :: FilePath -> IO () ocr input = withSystemTempDirectory (takeBaseName input) $ \tmp -> do hasText <- (not . T.null) . T.strip . T.decodeUtf8 . LB.toStrict <$> sh (printf "pdftotext '%s' -" input) when (not hasText) do let fn suffix = tmp takeBaseName input <> suffix pdfInfo <- parsePdfInfo <$> sh (printf "pdfinfo '%s'" input) pdfImages <- parsePdfImages <$> sh (printf "pdfimages -list '%s'" input) let isScan = length pdfImages == pdfInfo.numPages && all ((pdfInfo.pageSize ==) . imageSize) pdfImages if isScan then sh_ (printf "pdfimages '%s' '%s' -tiff" input (fn "")) else sh_ (printf "pdftoppm '%s' '%s' -r 300 -tiff" input (fn "-%d.pdf")) imageFiles <- sort <$> listDirectory tmp -- XXX add DPI information to image so that resulting pdf preserves DPI mapM_ ( \(pdfImage, imageFile) -> sh_ ( printf "convert -density %dx%d -units PixelsPerInch '%s' '%s'" pdfImage.xPpi pdfImage.yPpi (tmp imageFile) (tmp imageFile) ) ) (zip pdfImages imageFiles) pdfFiles <- mapM (ocr1 tmp . (tmp )) imageFiles sh_ ("pdfunite " ++ intercalate " " (map (printf "'%s'" . (tmp )) pdfFiles ++ [printf "'%s'" (fn ".pdf")])) printf "~ copyFile %s %s\n" input (input <.> "bak") copyFile input (input <.> "bak") printf "~ copyFile %s %s\n" (fn ".pdf") (takeDirectory input "." <> takeBaseName input <.> "pdf") copyFile (fn ".pdf") (takeDirectory input "." <> takeBaseName input <.> "pdf") printf "~ renameFile %s %s\n" (takeDirectory input "." <> takeBaseName input <.> "pdf") input renameFile (takeDirectory input "." <> takeBaseName input <.> "pdf") input ocr1 :: FilePath -> FilePath -> IO FilePath ocr1 tmp input = do sh_ ( printf "tesseract '%s' '%s' pdf" (tmp input) (tmp takeBaseName input) ) pure (takeBaseName input <.> "pdf") data PdfInfo = PdfInfo { numPages :: Int, pageSize :: (Double, Double) } deriving (Show) data PdfInfoException = PdfInfoException deriving (Show) instance Exception PdfInfoException parsePdfInfo :: LB.ByteString -> PdfInfo parsePdfInfo out' = fromMaybe (throw PdfInfoException) $ do numPages <- readMaybe . T.unpack =<< M.lookup "Pages" kvs pageSize <- rightToMaybe . A.parseOnly pageSizeParser =<< M.lookup "Page size" kvs pure PdfInfo {..} where out = T.decodeUtf8 (LB.toStrict out') kvs = M.fromList . map (second T.stripStart) . map (second T.tail . T.break (== ':')) . filter (not . T.null) . T.lines $ out pageSizeParser = (,) <$> (A.double <* A.string " x ") <*> (A.double <* A.string " pts (A4)") <* A.endOfInput type PdfImages = [PdfImage] data PdfImage = PdfImage { page :: Int, num :: Int, type_ :: String, width :: Int, height :: Int, color :: String, comp :: Int, bpc :: Int, enc :: String, interp :: String, object :: Int, id :: Int, xPpi :: Int, yPpi :: Int, size :: String, ratio :: String } deriving (Show) imageSize :: PdfImage -> (Double, Double) imageSize (PdfImage {..}) = let f ppi p = 72 * fromIntegral p / fromIntegral ppi in (f xPpi width, f yPpi height) data PdfImagesException = PdfImagesException deriving (Show) instance Exception PdfImagesException data ProcessException = ProcessException Int LB.ByteString deriving (Show) instance Exception ProcessException parsePdfImages :: LB.ByteString -> PdfImages parsePdfImages out' = map ( \(page' : num' : type_ : width' : height' : color : comp' : bpc' : enc : interp : object' : id' : xPpi' : yPpi' : size : ratio : []) -> PdfImage { page = read page', num = read num', width = read width', height = read height', comp = read comp', bpc = read bpc', object = read object', id = read id', xPpi = read xPpi', yPpi = read yPpi', .. } ) . map (map T.unpack) . map T.words . drop 2 . filter (not . T.null) . T.lines $ out where out = T.decodeUtf8 (LB.toStrict out') sh :: String -> IO LB.ByteString sh cmd = do (exitCode, out, err) <- readProcess (fromString cmd) case exitCode of ExitSuccess -> return out ExitFailure exitCode' -> throwIO $ ProcessException exitCode' err sh_ :: String -> IO () sh_ = fmap (\_ -> ()) . sh rightToMaybe :: Either e a -> Maybe a rightToMaybe = either (const Nothing) Just parMapM :: (a -> IO b) -> [a] -> IO [b] parMapM f xs = do n <- getNumProcessors withPool n $ \pool -> parallel pool (map f xs) parMapM_ :: (a -> IO b) -> [a] -> IO () parMapM_ f = fmap (const ()) . parMapM f