summary | refs | log | tree | commit | diff | stats
path: root/app/Main.hs
diff options
context:
space:
mode:
author: Alexander Foremny <aforemny@posteo.de> 2023-12-21 13:09:13 +0100
committer: Alexander Foremny <aforemny@posteo.de> 2023-12-21 13:09:17 +0100
commit: 70a55c75617aaacd7da25a232273bf1d81e855bf (patch)
tree: 921bcc9170ef26e6f279e98575c3bb3416de461e /app/Main.hs
parent: 65fe1bd03b00a6372b7eabefb6a7380a4451d3a3 (diff)
chore: add attrs
Diffstat (limited to 'app/Main.hs')
-rw-r--r-- app/Main.hs 23
1 files changed, 21 insertions, 2 deletions
diff --git a/app/Main.hs b/app/Main.hs
index 57e015d..b6441c8 100644
--- a/app/Main.hs
+++ b/app/Main.hs
@@ -9,6 +9,7 @@ import Control.Arrow (second)
import Control.Concurrent.ParallelIO.Local (parallel, withPool)
import Control.Exception (Exception, throw, throwIO)
import Control.Monad (join, when)
+import Data.Aeson qualified as J
import Data.Attoparsec.Text qualified as A
import Data.ByteString.Lazy qualified as LB
import Data.List
@@ -19,6 +20,7 @@ import Data.Text qualified as T
import Data.Text.Encoding qualified as T
import Debug.Trace
import GHC.Conc (getNumProcessors)
+import GHC.Generics (Generic)
import System.Directory
import System.FilePath
import System.IO.Temp (withSystemTempDirectory)
@@ -31,6 +33,7 @@ main =
parMapM_ ocr
=<< sort
. map ("originals" </>)
+ . filter (not . (".attrs" `isSuffixOf`))
. filter (not . (".bak" `isSuffixOf`))
<$> listDirectory "originals"
@@ -40,9 +43,10 @@ debug s x =
ocr :: FilePath -> IO ()
ocr input = withSystemTempDirectory (takeBaseName input) $ \tmp -> do
- hasText <-
- (not . T.null) . T.strip . T.decodeUtf8 . LB.toStrict
+ originalText <-
+ T.decodeUtf8 . LB.toStrict
<$> sh (printf "pdftotext '%s' -" input)
+ let hasText = (not . T.null) . T.strip $ originalText
when (not hasText) do
let fn suffix = tmp </> takeBaseName input <> suffix
pdfInfo <- parsePdfInfo <$> sh (printf "pdfinfo '%s'" input)
@@ -75,6 +79,12 @@ ocr input = withSystemTempDirectory (takeBaseName input) $ \tmp -> do
copyFile (fn ".pdf") (takeDirectory input </> "." <> takeBaseName input <.> "pdf")
printf "~ renameFile %s %s\n" (takeDirectory input </> "." <> takeBaseName input <.> "pdf") input
renameFile (takeDirectory input </> "." <> takeBaseName input <.> "pdf") input
+ let attrsFile = takeDirectory input </> takeBaseName input <.> "attrs"
+ doesAttrsFileExist <- doesFileExist attrsFile
+ when (not doesAttrsFileExist) $
+ J.encodeFile attrsFile Attrs {..}
+ Just attrs <- J.decodeFileStrict attrsFile
+ print (attrs :: Attrs)
ocr1 :: FilePath -> FilePath -> IO FilePath
ocr1 tmp input = do
@@ -86,6 +96,15 @@ ocr1 tmp input = do
)
pure (takeBaseName input <.> "pdf")
+data Attrs = Attrs
+ { originalText :: T.Text
+ }
+ deriving (Show, Generic, Eq)
+
+instance J.ToJSON Attrs
+
+instance J.FromJSON Attrs
+
data PdfInfo = PdfInfo
{ numPages :: Int,
pageSize :: (Double, Double)