diff options
-rw-r--r-- | apaperless.cabal | 5 | ||||
-rw-r--r-- | app/Main.hs | 84 |
2 files changed, 52 insertions, 37 deletions
diff --git a/apaperless.cabal b/apaperless.cabal index 81f83fb..136017b 100644 --- a/apaperless.cabal +++ b/apaperless.cabal @@ -14,7 +14,7 @@ extra-doc-files: CHANGELOG.md -- extra-source-files: common warnings - ghc-options: -Wall + ghc-options: -Wall -threaded executable apaperless import: warnings @@ -30,6 +30,7 @@ executable apaperless filepath, text, containers, - attoparsec + attoparsec, + parallel-io hs-source-dirs: app default-language: GHC2021 diff --git a/app/Main.hs b/app/Main.hs index 658b28e..57e015d 100644 --- a/app/Main.hs +++ b/app/Main.hs @@ -6,6 +6,7 @@ module Main where import Control.Arrow (second) +import Control.Concurrent.ParallelIO.Local (parallel, withPool) import Control.Exception (Exception, throw, throwIO) import Control.Monad (join, when) import Data.Attoparsec.Text qualified as A @@ -17,6 +18,7 @@ import Data.String (IsString (fromString)) import Data.Text qualified as T import Data.Text.Encoding qualified as T import Debug.Trace +import GHC.Conc (getNumProcessors) import System.Directory import System.FilePath import System.IO.Temp (withSystemTempDirectory) @@ -25,49 +27,54 @@ import Text.Printf (printf) import Text.Read (readMaybe) main :: IO () -main = do - let input = "0000001.pdf" - ocr input +main = + parMapM_ ocr + =<< sort + . map ("originals" </>) + . filter (not . (".bak" `isSuffixOf`)) + <$> listDirectory "originals" debug :: Show a => String -> a -> a debug s x = trace (printf "%s: %s\n" s (show x)) x ocr :: FilePath -> IO () -ocr input = do - withSystemTempDirectory input $ \tmp -> do +ocr input = withSystemTempDirectory (takeBaseName input) $ \tmp -> do + hasText <- + (not . T.null) . T.strip . T.decodeUtf8 . LB.toStrict + <$> sh (printf "pdftotext '%s' -" input) + when (not hasText) do let fn suffix = tmp </> takeBaseName input <> suffix pdfInfo <- parsePdfInfo <$> sh (printf "pdfinfo '%s'" input) pdfImages <- parsePdfImages <$> sh (printf "pdfimages -list '%s'" input) - hasText <- - (not . T.null) . T.strip . T.decodeUtf8 . LB.toStrict - <$> sh (printf "pdftotext '%s' -" input) - when (not hasText) do - let isScan = - length pdfImages == pdfInfo.numPages - && all ((pdfInfo.pageSize ==) . imageSize) pdfImages - if isScan - then sh_ (printf "pdfimages '%s' '%s' -tiff" input (fn "")) - else sh_ (printf "pdftoppm '%s' '%s' -r 300 -tiff" input (fn "-%d.pdf")) - imageFiles <- sort <$> listDirectory tmp - -- XXX add DPI information to image so that resulting pdf preserves DPI - mapM_ - ( \(pdfImage, imageFile) -> - sh_ - ( printf - "convert -density %dx%d -units PixelsPerInch '%s' '%s'" - pdfImage.xPpi - pdfImage.yPpi - (tmp </> imageFile) - (tmp </> imageFile) - ) - ) - (zip pdfImages imageFiles) - pdfFiles <- mapM (ocr1 tmp . (tmp </>)) imageFiles - sh_ ("pdfunite " ++ intercalate " " (map (printf "'%s'" . (tmp </>)) pdfFiles ++ [printf "'%s'" (tmp </> input)])) - copyFile input (input <.> "bak") - copyFile (tmp </> input) ("." <> input) - renameFile ("." <> input) input + let isScan = + length pdfImages == pdfInfo.numPages + && all ((pdfInfo.pageSize ==) . imageSize) pdfImages + if isScan + then sh_ (printf "pdfimages '%s' '%s' -tiff" input (fn "")) + else sh_ (printf "pdftoppm '%s' '%s' -r 300 -tiff" input (fn "-%d.pdf")) + imageFiles <- sort <$> listDirectory tmp + -- XXX add DPI information to image so that resulting pdf preserves DPI + mapM_ + ( \(pdfImage, imageFile) -> + sh_ + ( printf + "convert -density %dx%d -units PixelsPerInch '%s' '%s'" + pdfImage.xPpi + pdfImage.yPpi + (tmp </> imageFile) + (tmp </> imageFile) + ) + ) + (zip pdfImages imageFiles) + pdfFiles <- mapM (ocr1 tmp . (tmp </>)) imageFiles + sh_ ("pdfunite " ++ intercalate " " (map (printf "'%s'" . (tmp </>)) pdfFiles ++ [printf "'%s'" (fn ".pdf")])) + printf "~ copyFile %s %s\n" input (input <.> "bak") + copyFile input (input <.> "bak") + printf "~ copyFile %s %s\n" (fn ".pdf") (takeDirectory input </> "." <> takeBaseName input <.> "pdf") + copyFile (fn ".pdf") (takeDirectory input </> "." <> takeBaseName input <.> "pdf") + printf "~ renameFile %s %s\n" (takeDirectory input </> "." <> takeBaseName input <.> "pdf") input + renameFile (takeDirectory input </> "." <> takeBaseName input <.> "pdf") input ocr1 :: FilePath -> FilePath -> IO FilePath ocr1 tmp input = do @@ -179,7 +186,6 @@ parsePdfImages out' = sh :: String -> IO LB.ByteString sh cmd = do - -- printf "+ %s\n" cmd (exitCode, out, err) <- readProcess (fromString cmd) case exitCode of ExitSuccess -> return out @@ -190,3 +196,11 @@ sh_ = fmap (\_ -> ()) . sh rightToMaybe :: Either e a -> Maybe a rightToMaybe = either (const Nothing) Just + +parMapM :: (a -> IO b) -> [a] -> IO [b] +parMapM f xs = do + n <- getNumProcessors + withPool n $ \pool -> parallel pool (map f xs) + +parMapM_ :: (a -> IO b) -> [a] -> IO () +parMapM_ f = fmap (const ()) . parMapM f |