diff options
author | Alexander Foremny <aforemny@posteo.de> | 2023-12-21 13:09:13 +0100 |
---|---|---|
committer | Alexander Foremny <aforemny@posteo.de> | 2023-12-21 13:09:17 +0100 |
commit | 70a55c75617aaacd7da25a232273bf1d81e855bf (patch) | |
tree | 921bcc9170ef26e6f279e98575c3bb3416de461e /app | |
parent | 65fe1bd03b00a6372b7eabefb6a7380a4451d3a3 (diff) |
chore: add attrs
Diffstat (limited to 'app')
-rw-r--r-- | app/Main.hs | 23 |
1 file changed, 21 insertions, 2 deletions
diff --git a/app/Main.hs b/app/Main.hs index 57e015d..b6441c8 100644 --- a/app/Main.hs +++ b/app/Main.hs @@ -9,6 +9,7 @@ import Control.Arrow (second) import Control.Concurrent.ParallelIO.Local (parallel, withPool) import Control.Exception (Exception, throw, throwIO) import Control.Monad (join, when) +import Data.Aeson qualified as J import Data.Attoparsec.Text qualified as A import Data.ByteString.Lazy qualified as LB import Data.List @@ -19,6 +20,7 @@ import Data.Text qualified as T import Data.Text.Encoding qualified as T import Debug.Trace import GHC.Conc (getNumProcessors) +import GHC.Generics (Generic) import System.Directory import System.FilePath import System.IO.Temp (withSystemTempDirectory) @@ -31,6 +33,7 @@ main = parMapM_ ocr =<< sort . map ("originals" </>) + . filter (not . (".attrs" `isSuffixOf`)) . filter (not . (".bak" `isSuffixOf`)) <$> listDirectory "originals" @@ -40,9 +43,10 @@ debug s x = ocr :: FilePath -> IO () ocr input = withSystemTempDirectory (takeBaseName input) $ \tmp -> do - hasText <- - (not . T.null) . T.strip . T.decodeUtf8 . LB.toStrict + originalText <- + T.decodeUtf8 . LB.toStrict <$> sh (printf "pdftotext '%s' -" input) + let hasText = (not . T.null) . T.strip $ originalText when (not hasText) do let fn suffix = tmp </> takeBaseName input <> suffix pdfInfo <- parsePdfInfo <$> sh (printf "pdfinfo '%s'" input) @@ -75,6 +79,12 @@ ocr input = withSystemTempDirectory (takeBaseName input) $ \tmp -> do copyFile (fn ".pdf") (takeDirectory input </> "." <> takeBaseName input <.> "pdf") printf "~ renameFile %s %s\n" (takeDirectory input </> "." <> takeBaseName input <.> "pdf") input renameFile (takeDirectory input </> "." 
<> takeBaseName input <.> "pdf") input + let attrsFile = takeDirectory input </> takeBaseName input <.> "attrs" + doesAttrsFileExist <- doesFileExist attrsFile + when (not doesAttrsFileExist) $ + J.encodeFile attrsFile Attrs {..} + Just attrs <- J.decodeFileStrict attrsFile + print (attrs :: Attrs) ocr1 :: FilePath -> FilePath -> IO FilePath ocr1 tmp input = do @@ -86,6 +96,15 @@ ocr1 tmp input = do ) pure (takeBaseName input <.> "pdf") +data Attrs = Attrs + { originalText :: T.Text + } + deriving (Show, Generic, Eq) + +instance J.ToJSON Attrs + +instance J.FromJSON Attrs + data PdfInfo = PdfInfo { numPages :: Int, pageSize :: (Double, Double) |