diff options
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | apaperless.cabal | 5 | ||||
-rw-r--r-- | app/Main.hs | 185 | ||||
-rw-r--r-- | default.nix | 2 |
4 files changed, 150 insertions, 43 deletions
@@ -1 +1,2 @@ /dist-newstyle +/store diff --git a/apaperless.cabal b/apaperless.cabal index bc12d95..4b0f3d6 100644 --- a/apaperless.cabal +++ b/apaperless.cabal @@ -33,6 +33,9 @@ executable apaperless attoparsec, parallel-io, aeson, - optparse-applicative + optparse-applicative, + SHA, + lock-file, + data-default hs-source-dirs: app default-language: GHC2021 diff --git a/app/Main.hs b/app/Main.hs index b6441c8..4ee4913 100644 --- a/app/Main.hs +++ b/app/Main.hs @@ -1,17 +1,22 @@ {-# LANGUAGE BlockArguments #-} +{-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedRecordDot #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RecordWildCards #-} +{-# LANGUAGE NoFieldSelectors #-} +{-# OPTIONS_GHC -fno-warn-name-shadowing #-} module Main where import Control.Arrow (second) import Control.Concurrent.ParallelIO.Local (parallel, withPool) import Control.Exception (Exception, throw, throwIO) -import Control.Monad (join, when) +import Control.Monad (unless, when) import Data.Aeson qualified as J import Data.Attoparsec.Text qualified as A import Data.ByteString.Lazy qualified as LB +import Data.Default +import Data.Digest.Pure.SHA (sha256, showDigest) import Data.List import Data.Map qualified as M import Data.Maybe (fromMaybe) @@ -21,33 +26,142 @@ import Data.Text.Encoding qualified as T import Debug.Trace import GHC.Conc (getNumProcessors) import GHC.Generics (Generic) +import Options.Applicative qualified as O import System.Directory +import System.Environment (getEnv) import System.FilePath +import System.IO.LockFile (withLockFile) import System.IO.Temp (withSystemTempDirectory) import System.Process.Typed import Text.Printf (printf) import Text.Read (readMaybe) +data Args = Args + { cmd :: Cmd + } + +data Cmd + = Consume Bool [FilePath] + | List + +args :: O.Parser Args +args = + Args <$> cmd + +cmd :: O.Parser Cmd +cmd = + O.hsubparser . mconcat $ + [ O.command "consume" . O.info consumeCmd $ + O.progDesc "Consume document(s)", + O.command "list" . O.info listCmd $ + O.progDesc "List document(s)" + ] + +consumeCmd :: O.Parser Cmd +consumeCmd = + Consume + <$> keepArg + <*> filePathsArg + +listCmd :: O.Parser Cmd +listCmd = + pure List + +filePathsArg :: O.Parser [FilePath] +filePathsArg = + O.many (O.strArgument (O.metavar "FILE" <> O.action "file")) + +keepArg :: O.Parser Bool +keepArg = + O.switch + ( O.long "keep" + <> O.help "Keep input document" + ) + main :: IO () -main = - parMapM_ ocr - =<< sort - . map ("originals" </>) - . filter (not . (".attrs" `isSuffixOf`)) - . filter (not . (".bak" `isSuffixOf`)) - <$> listDirectory "originals" +main = do + cwd <- getCurrentDirectory + setCurrentDirectory =<< getEnv "APAPERLESS_STORE_DIR" + ensureGit + ensureDir "originals" + ensureDir "index" + + O.execParser (O.info (args O.<**> O.helper) O.idm) >>= \case + Args {cmd = Consume keep filePaths} -> + mapM_ putStrLn + =<< parMapM (consume1 keep) (map (cwd </>) filePaths) + Args {cmd = List} -> do + mapM_ (putStrLn . takeBaseName . fst) + =<< parMapM + ( \iFileName -> + (,) iFileName <$> decodeFile @Index ("index" </> iFileName) + ) + =<< sort <$> listDirectory "index" + +ensureGit :: IO () +ensureGit = do + doesExist <- doesDirectoryExist ".git" + unless doesExist $ sh_ "git init --initial-branch main" + +ensureDir :: FilePath -> IO () +ensureDir dirName = + createDirectoryIfMissing False dirName debug :: Show a => String -> a -> a debug s x = trace (printf "%s: %s\n" s (show x)) x -ocr :: FilePath -> IO () -ocr input = withSystemTempDirectory (takeBaseName input) $ \tmp -> do - originalText <- - T.decodeUtf8 . LB.toStrict - <$> sh (printf "pdftotext '%s' -" input) - let hasText = (not . T.null) . T.strip $ originalText - when (not hasText) do +fileKey :: FilePath -> IO FilePath +fileKey filePath = + showDigest . sha256 <$> LB.readFile filePath + +consume1 :: Bool -> FilePath -> IO FilePath +consume1 keep filePath = do + fKey <- fileKey filePath + let oFilePath = "originals" </> fKey <.> takeExtension filePath + originalExists <- doesFileExist oFilePath + when originalExists do + error (printf "error: error adding %s: duplicate of %s\n" filePath oFilePath) + let iFilePath = "index" </> fKey <.> "json" + originalText <- do + originalText' <- + T.decodeUtf8 . LB.toStrict + <$> sh (printf "pdftotext '%s' -" filePath) + let hasText = (not . T.null) . T.strip $ originalText' + if not hasText + then ocr filePath + else pure originalText' + withGit do + J.encodeFile iFilePath Index {..} + if keep + then copyFile filePath oFilePath + else renameFile filePath oFilePath + commitAll + [iFilePath, oFilePath] + (printf "add %s" (takeFileName filePath)) + pure (takeBaseName iFilePath) + +withGit :: IO a -> IO a +withGit = withLockFile def ".gitlock" + +commitAll :: [FilePath] -> String -> IO () +commitAll fps m = do + sh_ ("git add -- " ++ intercalate " " (map (printf "'%s'") fps)) + sh_ (printf "git commit -m '%s'" m) + +data DecodeException = DecodeException FilePath String + deriving (Show) + +instance Exception DecodeException + +decodeFile :: J.FromJSON a => FilePath -> IO a +decodeFile fp = + either (throwIO . DecodeException fp) pure . J.eitherDecode + =<< LB.readFile fp + +ocr :: FilePath -> IO T.Text +ocr input = + withSystemTempDirectory (takeBaseName input) $ \tmp -> do let fn suffix = tmp </> takeBaseName input <> suffix pdfInfo <- parsePdfInfo <$> sh (printf "pdfinfo '%s'" input) pdfImages <- parsePdfImages <$> sh (printf "pdfimages -list '%s'" input) @@ -59,7 +173,7 @@ ocr input = withSystemTempDirectory (takeBaseName input) $ \tmp -> do else sh_ (printf "pdftoppm '%s' '%s' -r 300 -tiff" input (fn "-%d.pdf")) imageFiles <- sort <$> listDirectory tmp -- XXX add DPI information to image so that resulting pdf preserves DPI - mapM_ + parMapM_ ( \(pdfImage, imageFile) -> sh_ ( printf @@ -71,39 +185,26 @@ ocr input = withSystemTempDirectory (takeBaseName input) $ \tmp -> do ) ) (zip pdfImages imageFiles) - pdfFiles <- mapM (ocr1 tmp . (tmp </>)) imageFiles - sh_ ("pdfunite " ++ intercalate " " (map (printf "'%s'" . (tmp </>)) pdfFiles ++ [printf "'%s'" (fn ".pdf")])) - printf "~ copyFile %s %s\n" input (input <.> "bak") - copyFile input (input <.> "bak") - printf "~ copyFile %s %s\n" (fn ".pdf") (takeDirectory input </> "." <> takeBaseName input <.> "pdf") - copyFile (fn ".pdf") (takeDirectory input </> "." <> takeBaseName input <.> "pdf") - printf "~ renameFile %s %s\n" (takeDirectory input </> "." <> takeBaseName input <.> "pdf") input - renameFile (takeDirectory input </> "." <> takeBaseName input <.> "pdf") input - let attrsFile = takeDirectory input </> takeBaseName input <.> "attrs" - doesAttrsFileExist <- doesFileExist attrsFile - when (not doesAttrsFileExist) $ - J.encodeFile attrsFile Attrs {..} - Just attrs <- J.decodeFileStrict attrsFile - print (attrs :: Attrs) - -ocr1 :: FilePath -> FilePath -> IO FilePath + T.unlines <$> mapM (ocr1 tmp . (tmp </>)) imageFiles + +ocr1 :: FilePath -> FilePath -> IO T.Text ocr1 tmp input = do - sh_ - ( printf - "tesseract '%s' '%s' pdf" - (tmp </> input) - (tmp </> takeBaseName input) - ) - pure (takeBaseName input <.> "pdf") + T.decodeUtf8 . LB.toStrict + <$> sh + ( printf + "tesseract '%s' '%s' pdf" + (tmp </> input) + (tmp </> takeBaseName input) + ) -data Attrs = Attrs +data Index = Index { originalText :: T.Text } deriving (Show, Generic, Eq) -instance J.ToJSON Attrs +instance J.ToJSON Index -instance J.FromJSON Attrs +instance J.FromJSON Index data PdfInfo = PdfInfo { numPages :: Int, diff --git a/default.nix b/default.nix index 16d73b0..399e153 100644 --- a/default.nix +++ b/default.nix @@ -38,6 +38,8 @@ rec { withHaddock = true; shellHook = '' HISTFILE=${pkgs.lib.escapeShellArg ./.}/.history; export HISTFILE + + APAPERLESS_STORE_DIR=${pkgs.lib.escapeShellArg ./.}/store; export APAPERLESS_STORE_DIR ''; }; } |