diff options
author | Alexander Foremny <aforemny@posteo.de> | 2023-12-21 06:18:17 +0100 |
---|---|---|
committer | Alexander Foremny <aforemny@posteo.de> | 2023-12-21 12:49:48 +0100 |
commit | 65fe1bd03b00a6372b7eabefb6a7380a4451d3a3 (patch) | |
tree | b9b1a4f8a47d7f78442b42742f1756b8a770f635 /app | |
parent | 52255a4158af8dfbd8f7a4dfbe14b4d46df2c601 (diff) |
chore: scan ./originals
Diffstat (limited to 'app')
-rw-r--r-- | app/Main.hs | 84 |
1 files changed, 49 insertions, 35 deletions
diff --git a/app/Main.hs b/app/Main.hs index 658b28e..57e015d 100644 --- a/app/Main.hs +++ b/app/Main.hs @@ -6,6 +6,7 @@ module Main where import Control.Arrow (second) +import Control.Concurrent.ParallelIO.Local (parallel, withPool) import Control.Exception (Exception, throw, throwIO) import Control.Monad (join, when) import Data.Attoparsec.Text qualified as A @@ -17,6 +18,7 @@ import Data.String (IsString (fromString)) import Data.Text qualified as T import Data.Text.Encoding qualified as T import Debug.Trace +import GHC.Conc (getNumProcessors) import System.Directory import System.FilePath import System.IO.Temp (withSystemTempDirectory) @@ -25,49 +27,54 @@ import Text.Printf (printf) import Text.Read (readMaybe) main :: IO () -main = do - let input = "0000001.pdf" - ocr input +main = + parMapM_ ocr + =<< sort + . map ("originals" </>) + . filter (not . (".bak" `isSuffixOf`)) + <$> listDirectory "originals" debug :: Show a => String -> a -> a debug s x = trace (printf "%s: %s\n" s (show x)) x ocr :: FilePath -> IO () -ocr input = do - withSystemTempDirectory input $ \tmp -> do +ocr input = withSystemTempDirectory (takeBaseName input) $ \tmp -> do + hasText <- + (not . T.null) . T.strip . T.decodeUtf8 . LB.toStrict + <$> sh (printf "pdftotext '%s' -" input) + when (not hasText) do let fn suffix = tmp </> takeBaseName input <> suffix pdfInfo <- parsePdfInfo <$> sh (printf "pdfinfo '%s'" input) pdfImages <- parsePdfImages <$> sh (printf "pdfimages -list '%s'" input) - hasText <- - (not . T.null) . T.strip . T.decodeUtf8 . LB.toStrict - <$> sh (printf "pdftotext '%s' -" input) - when (not hasText) do - let isScan = - length pdfImages == pdfInfo.numPages - && all ((pdfInfo.pageSize ==) . imageSize) pdfImages - if isScan - then sh_ (printf "pdfimages '%s' '%s' -tiff" input (fn "")) - else sh_ (printf "pdftoppm '%s' '%s' -r 300 -tiff" input (fn "-%d.pdf")) - imageFiles <- sort <$> listDirectory tmp - -- XXX add DPI information to image so that resulting pdf preserves DPI - mapM_ - ( \(pdfImage, imageFile) -> - sh_ - ( printf - "convert -density %dx%d -units PixelsPerInch '%s' '%s'" - pdfImage.xPpi - pdfImage.yPpi - (tmp </> imageFile) - (tmp </> imageFile) - ) - ) - (zip pdfImages imageFiles) - pdfFiles <- mapM (ocr1 tmp . (tmp </>)) imageFiles - sh_ ("pdfunite " ++ intercalate " " (map (printf "'%s'" . (tmp </>)) pdfFiles ++ [printf "'%s'" (tmp </> input)])) - copyFile input (input <.> "bak") - copyFile (tmp </> input) ("." <> input) - renameFile ("." <> input) input + let isScan = + length pdfImages == pdfInfo.numPages + && all ((pdfInfo.pageSize ==) . imageSize) pdfImages + if isScan + then sh_ (printf "pdfimages '%s' '%s' -tiff" input (fn "")) + else sh_ (printf "pdftoppm '%s' '%s' -r 300 -tiff" input (fn "-%d.pdf")) + imageFiles <- sort <$> listDirectory tmp + -- XXX add DPI information to image so that resulting pdf preserves DPI + mapM_ + ( \(pdfImage, imageFile) -> + sh_ + ( printf + "convert -density %dx%d -units PixelsPerInch '%s' '%s'" + pdfImage.xPpi + pdfImage.yPpi + (tmp </> imageFile) + (tmp </> imageFile) + ) + ) + (zip pdfImages imageFiles) + pdfFiles <- mapM (ocr1 tmp . (tmp </>)) imageFiles + sh_ ("pdfunite " ++ intercalate " " (map (printf "'%s'" . (tmp </>)) pdfFiles ++ [printf "'%s'" (fn ".pdf")])) + printf "~ copyFile %s %s\n" input (input <.> "bak") + copyFile input (input <.> "bak") + printf "~ copyFile %s %s\n" (fn ".pdf") (takeDirectory input </> "." <> takeBaseName input <.> "pdf") + copyFile (fn ".pdf") (takeDirectory input </> "." <> takeBaseName input <.> "pdf") + printf "~ renameFile %s %s\n" (takeDirectory input </> "." <> takeBaseName input <.> "pdf") input + renameFile (takeDirectory input </> "." <> takeBaseName input <.> "pdf") input ocr1 :: FilePath -> FilePath -> IO FilePath ocr1 tmp input = do @@ -179,7 +186,6 @@ parsePdfImages out' = sh :: String -> IO LB.ByteString sh cmd = do - -- printf "+ %s\n" cmd (exitCode, out, err) <- readProcess (fromString cmd) case exitCode of ExitSuccess -> return out @@ -190,3 +196,11 @@ sh_ = fmap (\_ -> ()) . sh rightToMaybe :: Either e a -> Maybe a rightToMaybe = either (const Nothing) Just + +parMapM :: (a -> IO b) -> [a] -> IO [b] +parMapM f xs = do + n <- getNumProcessors + withPool n $ \pool -> parallel pool (map f xs) + +parMapM_ :: (a -> IO b) -> [a] -> IO () +parMapM_ f = fmap (const ()) . parMapM f |