diff options
Diffstat (limited to 'app/Main.hs')
-rw-r--r-- | app/Main.hs | 23 |
1 file changed, 21 insertions, 2 deletions
diff --git a/app/Main.hs b/app/Main.hs index 57e015d..b6441c8 100644 --- a/app/Main.hs +++ b/app/Main.hs @@ -9,6 +9,7 @@ import Control.Arrow (second) import Control.Concurrent.ParallelIO.Local (parallel, withPool) import Control.Exception (Exception, throw, throwIO) import Control.Monad (join, when) +import Data.Aeson qualified as J import Data.Attoparsec.Text qualified as A import Data.ByteString.Lazy qualified as LB import Data.List @@ -19,6 +20,7 @@ import Data.Text qualified as T import Data.Text.Encoding qualified as T import Debug.Trace import GHC.Conc (getNumProcessors) +import GHC.Generics (Generic) import System.Directory import System.FilePath import System.IO.Temp (withSystemTempDirectory) @@ -31,6 +33,7 @@ main = parMapM_ ocr =<< sort . map ("originals" </>) + . filter (not . (".attrs" `isSuffixOf`)) . filter (not . (".bak" `isSuffixOf`)) <$> listDirectory "originals" @@ -40,9 +43,10 @@ debug s x = ocr :: FilePath -> IO () ocr input = withSystemTempDirectory (takeBaseName input) $ \tmp -> do - hasText <- - (not . T.null) . T.strip . T.decodeUtf8 . LB.toStrict + originalText <- + T.decodeUtf8 . LB.toStrict <$> sh (printf "pdftotext '%s' -" input) + let hasText = (not . T.null) . T.strip $ originalText when (not hasText) do let fn suffix = tmp </> takeBaseName input <> suffix pdfInfo <- parsePdfInfo <$> sh (printf "pdfinfo '%s'" input) @@ -75,6 +79,12 @@ ocr input = withSystemTempDirectory (takeBaseName input) $ \tmp -> do copyFile (fn ".pdf") (takeDirectory input </> "." <> takeBaseName input <.> "pdf") printf "~ renameFile %s %s\n" (takeDirectory input </> "." <> takeBaseName input <.> "pdf") input renameFile (takeDirectory input </> "." 
<> takeBaseName input <.> "pdf") input + let attrsFile = takeDirectory input </> takeBaseName input <.> "attrs" + doesAttrsFileExist <- doesFileExist attrsFile + when (not doesAttrsFileExist) $ + J.encodeFile attrsFile Attrs {..} + Just attrs <- J.decodeFileStrict attrsFile + print (attrs :: Attrs) ocr1 :: FilePath -> FilePath -> IO FilePath ocr1 tmp input = do @@ -86,6 +96,15 @@ ocr1 tmp input = do ) pure (takeBaseName input <.> "pdf") +data Attrs = Attrs + { originalText :: T.Text + } + deriving (Show, Generic, Eq) + +instance J.ToJSON Attrs + +instance J.FromJSON Attrs + data PdfInfo = PdfInfo { numPages :: Int, pageSize :: (Double, Double) |