From eb584575ca1fb1420c217452d659460aa5736663 Mon Sep 17 00:00:00 2001
From: Alexander Foremny <aforemny@posteo.de>
Date: Fri, 22 Dec 2023 03:16:44 +0100
Subject: chore: add commands `consume`, `list`

---
 app/Main.hs | 185 ++++++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 143 insertions(+), 42 deletions(-)

(limited to 'app')

diff --git a/app/Main.hs b/app/Main.hs
index b6441c8..4ee4913 100644
--- a/app/Main.hs
+++ b/app/Main.hs
@@ -1,17 +1,22 @@
 {-# LANGUAGE BlockArguments #-}
+{-# LANGUAGE LambdaCase #-}
 {-# LANGUAGE OverloadedRecordDot #-}
 {-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE RecordWildCards #-}
+{-# LANGUAGE NoFieldSelectors #-}
+{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
 
 module Main where
 
 import Control.Arrow (second)
 import Control.Concurrent.ParallelIO.Local (parallel, withPool)
 import Control.Exception (Exception, throw, throwIO)
-import Control.Monad (join, when)
+import Control.Monad (unless, when)
 import Data.Aeson qualified as J
 import Data.Attoparsec.Text qualified as A
 import Data.ByteString.Lazy qualified as LB
+import Data.Default
+import Data.Digest.Pure.SHA (sha256, showDigest)
 import Data.List
 import Data.Map qualified as M
 import Data.Maybe (fromMaybe)
@@ -21,33 +26,142 @@ import Data.Text.Encoding qualified as T
 import Debug.Trace
 import GHC.Conc (getNumProcessors)
 import GHC.Generics (Generic)
+import Options.Applicative qualified as O
 import System.Directory
+import System.Environment (getEnv)
 import System.FilePath
+import System.IO.LockFile (withLockFile)
 import System.IO.Temp (withSystemTempDirectory)
 import System.Process.Typed
 import Text.Printf (printf)
 import Text.Read (readMaybe)
 
+data Args = Args
+  { cmd :: Cmd
+  }
+
+data Cmd
+  = Consume Bool [FilePath]
+  | List
+
+args :: O.Parser Args
+args =
+  Args <$> cmd
+
+cmd :: O.Parser Cmd
+cmd =
+  O.hsubparser . mconcat $
+    [ O.command "consume" . O.info consumeCmd $
+        O.progDesc "Consume document(s)",
+      O.command "list" . O.info listCmd $
+        O.progDesc "List document(s)"
+    ]
+
+consumeCmd :: O.Parser Cmd
+consumeCmd =
+  Consume
+    <$> keepArg
+    <*> filePathsArg
+
+listCmd :: O.Parser Cmd
+listCmd =
+  pure List
+
+filePathsArg :: O.Parser [FilePath]
+filePathsArg =
+  O.many (O.strArgument (O.metavar "FILE" <> O.action "file"))
+
+keepArg :: O.Parser Bool
+keepArg =
+  O.switch
+    ( O.long "keep"
+        <> O.help "Keep input document"
+    )
+
 main :: IO ()
-main =
-  parMapM_ ocr
-    =<< sort
-      . map ("originals" </>)
-      . filter (not . (".attrs" `isSuffixOf`))
-      . filter (not . (".bak" `isSuffixOf`))
-      <$> listDirectory "originals"
+main = do
+  cwd <- getCurrentDirectory
+  setCurrentDirectory =<< getEnv "APAPERLESS_STORE_DIR"
+  ensureGit
+  ensureDir "originals"
+  ensureDir "index"
+
+  O.execParser (O.info (args O.<**> O.helper) O.idm) >>= \case
+    Args {cmd = Consume keep filePaths} ->
+      mapM_ putStrLn
+        =<< parMapM (consume1 keep) (map (cwd </>) filePaths)
+    Args {cmd = List} -> do
+      mapM_ (putStrLn . takeBaseName . fst)
+        =<< parMapM
+          ( \iFileName ->
+              (,) iFileName <$> decodeFile @Index ("index" </> iFileName)
+          )
+        =<< sort <$> listDirectory "index"
+
+ensureGit :: IO ()
+ensureGit = do
+  doesExist <- doesDirectoryExist ".git"
+  unless doesExist $ sh_ "git init --initial-branch main"
+
+ensureDir :: FilePath -> IO ()
+ensureDir dirName =
+  createDirectoryIfMissing False dirName
 
 debug :: Show a => String -> a -> a
 debug s x =
   trace (printf "%s: %s\n" s (show x)) x
 
-ocr :: FilePath -> IO ()
-ocr input = withSystemTempDirectory (takeBaseName input) $ \tmp -> do
-  originalText <-
-    T.decodeUtf8 . LB.toStrict
-      <$> sh (printf "pdftotext '%s' -" input)
-  let hasText = (not . T.null) . T.strip $ originalText
-  when (not hasText) do
+fileKey :: FilePath -> IO FilePath
+fileKey filePath =
+  showDigest . sha256 <$> LB.readFile filePath
+
+consume1 :: Bool -> FilePath -> IO FilePath
+consume1 keep filePath = do
+  fKey <- fileKey filePath
+  let oFilePath = "originals" </> fKey <.> takeExtension filePath
+  originalExists <- doesFileExist oFilePath
+  when originalExists do
+    error (printf "error: error adding %s: duplicate of %s\n" filePath oFilePath)
+  let iFilePath = "index" </> fKey <.> "json"
+  originalText <- do
+    originalText' <-
+      T.decodeUtf8 . LB.toStrict
+        <$> sh (printf "pdftotext '%s' -" filePath)
+    let hasText = (not . T.null) . T.strip $ originalText'
+    if not hasText
+      then ocr filePath
+      else pure originalText'
+  withGit do
+    J.encodeFile iFilePath Index {..}
+    if keep
+      then copyFile filePath oFilePath
+      else renameFile filePath oFilePath
+    commitAll
+      [iFilePath, oFilePath]
+      (printf "add %s" (takeFileName filePath))
+  pure (takeBaseName iFilePath)
+
+withGit :: IO a -> IO a
+withGit = withLockFile def ".gitlock"
+
+commitAll :: [FilePath] -> String -> IO ()
+commitAll fps m = do
+  sh_ ("git add -- " ++ intercalate " " (map (printf "'%s'") fps))
+  sh_ (printf "git commit -m '%s'" m)
+
+data DecodeException = DecodeException FilePath String
+  deriving (Show)
+
+instance Exception DecodeException
+
+decodeFile :: J.FromJSON a => FilePath -> IO a
+decodeFile fp =
+  either (throwIO . DecodeException fp) pure . J.eitherDecode
+    =<< LB.readFile fp
+
+ocr :: FilePath -> IO T.Text
+ocr input =
+  withSystemTempDirectory (takeBaseName input) $ \tmp -> do
     let fn suffix = tmp </> takeBaseName input <> suffix
     pdfInfo <- parsePdfInfo <$> sh (printf "pdfinfo '%s'" input)
     pdfImages <- parsePdfImages <$> sh (printf "pdfimages -list '%s'" input)
@@ -59,7 +173,7 @@ ocr input = withSystemTempDirectory (takeBaseName input) $ \tmp -> do
       else sh_ (printf "pdftoppm '%s' '%s' -r 300 -tiff" input (fn "-%d.pdf"))
     imageFiles <- sort <$> listDirectory tmp
     -- XXX add DPI information to image so that resulting pdf preserves DPI
-    mapM_
+    parMapM_
       ( \(pdfImage, imageFile) ->
           sh_
             ( printf
@@ -71,39 +185,26 @@ ocr input = withSystemTempDirectory (takeBaseName input) $ \tmp -> do
             )
       )
       (zip pdfImages imageFiles)
-    pdfFiles <- mapM (ocr1 tmp . (tmp </>)) imageFiles
-    sh_ ("pdfunite " ++ intercalate " " (map (printf "'%s'" . (tmp </>)) pdfFiles ++ [printf "'%s'" (fn ".pdf")]))
-    printf "~ copyFile %s %s\n" input (input <.> "bak")
-    copyFile input (input <.> "bak")
-    printf "~ copyFile %s %s\n" (fn ".pdf") (takeDirectory input </> "." <> takeBaseName input <.> "pdf")
-    copyFile (fn ".pdf") (takeDirectory input </> "." <> takeBaseName input <.> "pdf")
-    printf "~ renameFile %s %s\n" (takeDirectory input </> "." <> takeBaseName input <.> "pdf") input
-    renameFile (takeDirectory input </> "." <> takeBaseName input <.> "pdf") input
-  let attrsFile = takeDirectory input </> takeBaseName input <.> "attrs"
-  doesAttrsFileExist <- doesFileExist attrsFile
-  when (not doesAttrsFileExist) $
-    J.encodeFile attrsFile Attrs {..}
-  Just attrs <- J.decodeFileStrict attrsFile
-  print (attrs :: Attrs)
-
-ocr1 :: FilePath -> FilePath -> IO FilePath
+    T.unlines <$> mapM (ocr1 tmp . (tmp </>)) imageFiles
+
+ocr1 :: FilePath -> FilePath -> IO T.Text
 ocr1 tmp input = do
-  sh_
-    ( printf
-        "tesseract '%s' '%s' pdf"
-        (tmp </> input)
-        (tmp </> takeBaseName input)
-    )
-  pure (takeBaseName input <.> "pdf")
+  T.decodeUtf8 . LB.toStrict
+    <$> sh
+      ( printf
+          "tesseract '%s' '%s' pdf"
+          (tmp </> input)
+          (tmp </> takeBaseName input)
+      )
 
-data Attrs = Attrs
+data Index = Index
   { originalText :: T.Text
   }
   deriving (Show, Generic, Eq)
 
-instance J.ToJSON Attrs
+instance J.ToJSON Index
 
-instance J.FromJSON Attrs
+instance J.FromJSON Index
 
 data PdfInfo = PdfInfo
   { numPages :: Int,
-- 
cgit v1.2.3