summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--.gitignore1
-rw-r--r--apaperless.cabal5
-rw-r--r--app/Main.hs185
-rw-r--r--default.nix2
4 files changed, 150 insertions, 43 deletions
diff --git a/.gitignore b/.gitignore
index 8075013..9dee242 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
/dist-newstyle
+/store
diff --git a/apaperless.cabal b/apaperless.cabal
index bc12d95..4b0f3d6 100644
--- a/apaperless.cabal
+++ b/apaperless.cabal
@@ -33,6 +33,9 @@ executable apaperless
attoparsec,
parallel-io,
aeson,
- optparse-applicative
+ optparse-applicative,
+ SHA,
+ lock-file,
+ data-default
hs-source-dirs: app
default-language: GHC2021
diff --git a/app/Main.hs b/app/Main.hs
index b6441c8..4ee4913 100644
--- a/app/Main.hs
+++ b/app/Main.hs
@@ -1,17 +1,22 @@
{-# LANGUAGE BlockArguments #-}
+{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedRecordDot #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
+{-# LANGUAGE NoFieldSelectors #-}
+{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
module Main where
import Control.Arrow (second)
import Control.Concurrent.ParallelIO.Local (parallel, withPool)
import Control.Exception (Exception, throw, throwIO)
-import Control.Monad (join, when)
+import Control.Monad (unless, when)
import Data.Aeson qualified as J
import Data.Attoparsec.Text qualified as A
import Data.ByteString.Lazy qualified as LB
+import Data.Default
+import Data.Digest.Pure.SHA (sha256, showDigest)
import Data.List
import Data.Map qualified as M
import Data.Maybe (fromMaybe)
@@ -21,33 +26,142 @@ import Data.Text.Encoding qualified as T
import Debug.Trace
import GHC.Conc (getNumProcessors)
import GHC.Generics (Generic)
+import Options.Applicative qualified as O
import System.Directory
+import System.Environment (getEnv)
import System.FilePath
+import System.IO.LockFile (withLockFile)
import System.IO.Temp (withSystemTempDirectory)
import System.Process.Typed
import Text.Printf (printf)
import Text.Read (readMaybe)
+data Args = Args
+ { cmd :: Cmd
+ }
+
+data Cmd
+ = Consume Bool [FilePath]
+ | List
+
+args :: O.Parser Args
+args =
+ Args <$> cmd
+
+cmd :: O.Parser Cmd
+cmd =
+ O.hsubparser . mconcat $
+ [ O.command "consume" . O.info consumeCmd $
+ O.progDesc "Consume document(s)",
+ O.command "list" . O.info listCmd $
+ O.progDesc "List document(s)"
+ ]
+
+consumeCmd :: O.Parser Cmd
+consumeCmd =
+ Consume
+ <$> keepArg
+ <*> filePathsArg
+
+listCmd :: O.Parser Cmd
+listCmd =
+ pure List
+
+filePathsArg :: O.Parser [FilePath]
+filePathsArg =
+ O.many (O.strArgument (O.metavar "FILE" <> O.action "file"))
+
+keepArg :: O.Parser Bool
+keepArg =
+ O.switch
+ ( O.long "keep"
+ <> O.help "Keep input document"
+ )
+
main :: IO ()
-main =
- parMapM_ ocr
- =<< sort
- . map ("originals" </>)
- . filter (not . (".attrs" `isSuffixOf`))
- . filter (not . (".bak" `isSuffixOf`))
- <$> listDirectory "originals"
+main = do
+ cwd <- getCurrentDirectory
+ setCurrentDirectory =<< getEnv "APAPERLESS_STORE_DIR"
+ ensureGit
+ ensureDir "originals"
+ ensureDir "index"
+
+ O.execParser (O.info (args O.<**> O.helper) O.idm) >>= \case
+ Args {cmd = Consume keep filePaths} ->
+ mapM_ putStrLn
+ =<< parMapM (consume1 keep) (map (cwd </>) filePaths)
+ Args {cmd = List} -> do
+ mapM_ (putStrLn . takeBaseName . fst)
+ =<< parMapM
+ ( \iFileName ->
+ (,) iFileName <$> decodeFile @Index ("index" </> iFileName)
+ )
+ =<< sort <$> listDirectory "index"
+
+ensureGit :: IO ()
+ensureGit = do
+ doesExist <- doesDirectoryExist ".git"
+ unless doesExist $ sh_ "git init --initial-branch main"
+
+ensureDir :: FilePath -> IO ()
+ensureDir dirName =
+ createDirectoryIfMissing False dirName
debug :: Show a => String -> a -> a
debug s x =
trace (printf "%s: %s\n" s (show x)) x
-ocr :: FilePath -> IO ()
-ocr input = withSystemTempDirectory (takeBaseName input) $ \tmp -> do
- originalText <-
- T.decodeUtf8 . LB.toStrict
- <$> sh (printf "pdftotext '%s' -" input)
- let hasText = (not . T.null) . T.strip $ originalText
- when (not hasText) do
+fileKey :: FilePath -> IO FilePath
+fileKey filePath =
+ showDigest . sha256 <$> LB.readFile filePath
+
+consume1 :: Bool -> FilePath -> IO FilePath
+consume1 keep filePath = do
+ fKey <- fileKey filePath
+ let oFilePath = "originals" </> fKey <.> takeExtension filePath
+ originalExists <- doesFileExist oFilePath
+ when originalExists do
+ error (printf "error: error adding %s: duplicate of %s\n" filePath oFilePath)
+ let iFilePath = "index" </> fKey <.> "json"
+ originalText <- do
+ originalText' <-
+ T.decodeUtf8 . LB.toStrict
+ <$> sh (printf "pdftotext '%s' -" filePath)
+ let hasText = (not . T.null) . T.strip $ originalText'
+ if not hasText
+ then ocr filePath
+ else pure originalText'
+ withGit do
+ J.encodeFile iFilePath Index {..}
+ if keep
+ then copyFile filePath oFilePath
+ else renameFile filePath oFilePath
+ commitAll
+ [iFilePath, oFilePath]
+ (printf "add %s" (takeFileName filePath))
+ pure (takeBaseName iFilePath)
+
+withGit :: IO a -> IO a
+withGit = withLockFile def ".gitlock"
+
+commitAll :: [FilePath] -> String -> IO ()
+commitAll fps m = do
+ sh_ ("git add -- " ++ intercalate " " (map (printf "'%s'") fps))
+ sh_ (printf "git commit -m '%s'" m)
+
+data DecodeException = DecodeException FilePath String
+ deriving (Show)
+
+instance Exception DecodeException
+
+decodeFile :: J.FromJSON a => FilePath -> IO a
+decodeFile fp =
+ either (throwIO . DecodeException fp) pure . J.eitherDecode
+ =<< LB.readFile fp
+
+ocr :: FilePath -> IO T.Text
+ocr input =
+ withSystemTempDirectory (takeBaseName input) $ \tmp -> do
let fn suffix = tmp </> takeBaseName input <> suffix
pdfInfo <- parsePdfInfo <$> sh (printf "pdfinfo '%s'" input)
pdfImages <- parsePdfImages <$> sh (printf "pdfimages -list '%s'" input)
@@ -59,7 +173,7 @@ ocr input = withSystemTempDirectory (takeBaseName input) $ \tmp -> do
else sh_ (printf "pdftoppm '%s' '%s' -r 300 -tiff" input (fn "-%d.pdf"))
imageFiles <- sort <$> listDirectory tmp
-- XXX add DPI information to image so that resulting pdf preserves DPI
- mapM_
+ parMapM_
( \(pdfImage, imageFile) ->
sh_
( printf
@@ -71,39 +185,26 @@ ocr input = withSystemTempDirectory (takeBaseName input) $ \tmp -> do
)
)
(zip pdfImages imageFiles)
- pdfFiles <- mapM (ocr1 tmp . (tmp </>)) imageFiles
- sh_ ("pdfunite " ++ intercalate " " (map (printf "'%s'" . (tmp </>)) pdfFiles ++ [printf "'%s'" (fn ".pdf")]))
- printf "~ copyFile %s %s\n" input (input <.> "bak")
- copyFile input (input <.> "bak")
- printf "~ copyFile %s %s\n" (fn ".pdf") (takeDirectory input </> "." <> takeBaseName input <.> "pdf")
- copyFile (fn ".pdf") (takeDirectory input </> "." <> takeBaseName input <.> "pdf")
- printf "~ renameFile %s %s\n" (takeDirectory input </> "." <> takeBaseName input <.> "pdf") input
- renameFile (takeDirectory input </> "." <> takeBaseName input <.> "pdf") input
- let attrsFile = takeDirectory input </> takeBaseName input <.> "attrs"
- doesAttrsFileExist <- doesFileExist attrsFile
- when (not doesAttrsFileExist) $
- J.encodeFile attrsFile Attrs {..}
- Just attrs <- J.decodeFileStrict attrsFile
- print (attrs :: Attrs)
-
-ocr1 :: FilePath -> FilePath -> IO FilePath
+ T.unlines <$> mapM (ocr1 tmp . (tmp </>)) imageFiles
+
+ocr1 :: FilePath -> FilePath -> IO T.Text
ocr1 tmp input = do
- sh_
- ( printf
- "tesseract '%s' '%s' pdf"
- (tmp </> input)
- (tmp </> takeBaseName input)
- )
- pure (takeBaseName input <.> "pdf")
+ T.decodeUtf8 . LB.toStrict
+ <$> sh
+ ( printf
+ "tesseract '%s' '%s' pdf"
+ (tmp </> input)
+ (tmp </> takeBaseName input)
+ )
-data Attrs = Attrs
+data Index = Index
{ originalText :: T.Text
}
deriving (Show, Generic, Eq)
-instance J.ToJSON Attrs
+instance J.ToJSON Index
-instance J.FromJSON Attrs
+instance J.FromJSON Index
data PdfInfo = PdfInfo
{ numPages :: Int,
diff --git a/default.nix b/default.nix
index 16d73b0..399e153 100644
--- a/default.nix
+++ b/default.nix
@@ -38,6 +38,8 @@ rec {
withHaddock = true;
shellHook = ''
HISTFILE=${pkgs.lib.escapeShellArg ./.}/.history; export HISTFILE
+
+ APAPERLESS_STORE_DIR=${pkgs.lib.escapeShellArg ./.}/store; export APAPERLESS_STORE_DIR
'';
};
}