summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- apaperless.cabal 4
-rw-r--r-- app/Main.hs 23
2 files changed, 24 insertions, 3 deletions
diff --git a/apaperless.cabal b/apaperless.cabal
index 136017b..bc12d95 100644
--- a/apaperless.cabal
+++ b/apaperless.cabal
@@ -31,6 +31,8 @@ executable apaperless
text,
containers,
attoparsec,
- parallel-io
+ parallel-io,
+ aeson,
+ optparse-applicative
hs-source-dirs: app
default-language: GHC2021
diff --git a/app/Main.hs b/app/Main.hs
index 57e015d..b6441c8 100644
--- a/app/Main.hs
+++ b/app/Main.hs
@@ -9,6 +9,7 @@ import Control.Arrow (second)
import Control.Concurrent.ParallelIO.Local (parallel, withPool)
import Control.Exception (Exception, throw, throwIO)
import Control.Monad (join, when)
+import Data.Aeson qualified as J
import Data.Attoparsec.Text qualified as A
import Data.ByteString.Lazy qualified as LB
import Data.List
@@ -19,6 +20,7 @@ import Data.Text qualified as T
import Data.Text.Encoding qualified as T
import Debug.Trace
import GHC.Conc (getNumProcessors)
+import GHC.Generics (Generic)
import System.Directory
import System.FilePath
import System.IO.Temp (withSystemTempDirectory)
@@ -31,6 +33,7 @@ main =
parMapM_ ocr
=<< sort
. map ("originals" </>)
+ . filter (not . (".attrs" `isSuffixOf`))
. filter (not . (".bak" `isSuffixOf`))
<$> listDirectory "originals"
@@ -40,9 +43,10 @@ debug s x =
ocr :: FilePath -> IO ()
ocr input = withSystemTempDirectory (takeBaseName input) $ \tmp -> do
- hasText <-
- (not . T.null) . T.strip . T.decodeUtf8 . LB.toStrict
+ originalText <-
+ T.decodeUtf8 . LB.toStrict
<$> sh (printf "pdftotext '%s' -" input)
+ let hasText = (not . T.null) . T.strip $ originalText
when (not hasText) do
let fn suffix = tmp </> takeBaseName input <> suffix
pdfInfo <- parsePdfInfo <$> sh (printf "pdfinfo '%s'" input)
@@ -75,6 +79,12 @@ ocr input = withSystemTempDirectory (takeBaseName input) $ \tmp -> do
copyFile (fn ".pdf") (takeDirectory input </> "." <> takeBaseName input <.> "pdf")
printf "~ renameFile %s %s\n" (takeDirectory input </> "." <> takeBaseName input <.> "pdf") input
renameFile (takeDirectory input </> "." <> takeBaseName input <.> "pdf") input
+ let attrsFile = takeDirectory input </> takeBaseName input <.> "attrs"
+ doesAttrsFileExist <- doesFileExist attrsFile
+ when (not doesAttrsFileExist) $
+ J.encodeFile attrsFile Attrs {..}
+ Just attrs <- J.decodeFileStrict attrsFile
+ print (attrs :: Attrs)
ocr1 :: FilePath -> FilePath -> IO FilePath
ocr1 tmp input = do
@@ -86,6 +96,15 @@ ocr1 tmp input = do
)
pure (takeBaseName input <.> "pdf")
+data Attrs = Attrs
+ { originalText :: T.Text
+ }
+ deriving (Show, Generic, Eq)
+
+instance J.ToJSON Attrs
+
+instance J.FromJSON Attrs
+
data PdfInfo = PdfInfo
{ numPages :: Int,
pageSize :: (Double, Double)