{-# LANGUAGE BlockArguments #-} {-# LANGUAGE DataKinds #-} {-# LANGUAGE DuplicateRecordFields #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedRecordDot #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE QuasiQuotes #-} {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE ViewPatterns #-} {-# LANGUAGE NoFieldSelectors #-} {-# OPTIONS_GHC -fno-warn-name-shadowing #-} module Main where import Control.Arrow (second) import Control.Concurrent.ParallelIO.Local (parallel, withPool) import Control.Exception (Exception, throw, throwIO) import Control.Monad (forM, unless, when) import Data.Aeson qualified as J import Data.Attoparsec.Text qualified as A import Data.ByteString.Lazy qualified as LB import Data.Default import Data.Digest.Pure.SHA (sha1, showDigest) import Data.Function ((&)) import Data.List import Data.List.NonEmpty qualified as N import Data.Map qualified as M import Data.Maybe (catMaybes, fromMaybe, mapMaybe) import Data.Ord (comparing) import Data.Set qualified as S import Data.String (IsString (fromString)) import Data.Text qualified as T import Data.Text.Encoding qualified as T import Data.Text.IO qualified as T import Data.Time.Clock (UTCTime, getCurrentTime) import Data.Time.Format.ISO8601 (iso8601Show) import Debug.Trace import GHC.Conc (getNumProcessors) import GHC.Generics (Generic) import GHC.Records (HasField (..)) import Options.Applicative qualified as O import Prompt qualified as P import Settings qualified as S import System.Directory import System.Environment (getEnv) import System.FilePath import System.IO.LockFile (withLockFile) import System.IO.Temp (withSystemTempDirectory) import System.Process.Typed import Tag import Text.Printf (printf) import Text.RE.TDFA.Text qualified as R import Text.Read (readMaybe) data Args = Args { cmd :: Cmd } data Cmd = Consume { keep :: Bool, inputs :: [FilePath], prompt :: Bool, force :: Bool } | Edit { indexNames :: [FilePath] } | List { filters :: [Filter], todo :: Bool, view :: Bool, redo :: Bool, edit :: Bool } | Todo | View { indexNames :: [FilePath] } args :: O.Parser Args args = Args <$> cmd cmd :: O.Parser Cmd cmd = O.hsubparser . mconcat $ [ O.command "consume" . O.info consumeCmd $ O.progDesc "Consume document(s)", O.command "edit" . O.info editCmd $ O.progDesc "Edit document(s)", O.command "list" . O.info listCmd $ O.progDesc "List document(s)", O.command "todo" . O.info todoCmd $ O.progDesc "Interactively process new documents", O.command "view" . O.info viewCmd $ O.progDesc "View document(s)" ] consumeCmd :: O.Parser Cmd consumeCmd = Consume <$> keepArg <*> inputsArg <*> promptArg <*> forceArg editCmd :: O.Parser Cmd editCmd = Edit <$> indexNamesArg listCmd :: O.Parser Cmd listCmd = List <$> filtersArg <*> todoArg <*> viewArg <*> redoArg <*> editArg todoCmd :: O.Parser Cmd todoCmd = pure Todo viewCmd :: O.Parser Cmd viewCmd = View <$> indexNamesArg inputsArg :: O.Parser [FilePath] inputsArg = O.many ( O.strArgument ( O.metavar "FILE" <> O.action "file" ) ) keepArg :: O.Parser Bool keepArg = O.switch ( O.long "keep" <> O.short 'k' <> O.help "Keep input document" ) promptArg :: O.Parser Bool promptArg = O.switch ( O.long "prompt" <> O.short 'p' <> O.help "Prompt after consuming document(s)" ) forceArg :: O.Parser Bool forceArg = O.switch ( O.long "force" <> O.short 'f' <> O.help "Force operation, overriding safety checks" ) filtersArg :: O.Parser [Filter] filtersArg = O.many $ O.option (O.maybeReader parse) ( O.long "filter" <> O.short 'f' <> O.help "Filter documents by tag" ) where parse ('@' : tagKey) = Just (Filter Include (FilterByTag (T.pack tagKey))) parse ('!' : '@' : tagKey) = Just (Filter Exclude (FilterByTag (T.pack tagKey))) parse _ = Nothing indexNamesArg :: O.Parser [FilePath] indexNamesArg = O.many $ O.strArgument ( O.metavar "ID" <> O.action "file" ) todoArg :: O.Parser Bool todoArg = O.switch ( O.long "todo" <> O.help "Run command `todo` on listed document(s)" ) redoArg :: O.Parser Bool redoArg = O.switch ( O.long "redo" <> O.help "Redo OCR on listed document(s)" ) editArg :: O.Parser Bool editArg = O.switch ( O.long "edit" <> O.help "Run command `edit` on listed document(s)" ) viewArg :: O.Parser Bool viewArg = O.switch ( O.long "view" <> O.help "Run command `view` on listed document(s)" ) data Filter = Filter Mode SimpleFilter data Mode = Include | Exclude data SimpleFilter = FilterByTag T.Text main :: IO () main = do settings <- S.readSettings S.writeSettings "apaperless.yaml" settings cwd <- getCurrentDirectory setCurrentDirectory =<< getEnv "APAPERLESS_STORE_DIR" ensureGit ensureDir "originals" ensureDir "index" O.execParser (O.info (args O.<**> O.helper) O.idm) >>= \case Args {cmd = Consume {keep, inputs, force, prompt}} -> do indexNames <- parMapM (consume1 force keep) (map (cwd ) inputs) allDocs <- getDocuments docs <- mapM (readDocument . (<.> "json")) indexNames docs' <- if prompt then processDocumentsInteractively settings allDocs docs else processDocuments settings allDocs docs mapM_ ( \doc -> do printf "%s\n" (takeBaseName doc.iFileName) printTags doc ) docs' Args {cmd = Edit {indexNames}} -> do editDocuments =<< mapM (readDocument . (<.> "json")) indexNames Args {cmd = List {filters, redo, todo = False, view = False, edit = False}} -> do doRedoIf filters redo mapM_ ( \doc -> do printf "%s\n" (takeBaseName doc.iFileName) printTags doc ) . applyFilters filters =<< getDocuments Args {cmd = List {filters, redo, edit = True}} -> do doRedoIf filters redo editDocuments . applyFilters filters =<< getDocuments Args {cmd = List {filters, redo, todo = True}} -> do doRedoIf filters redo allDocs <- getDocuments _ <- processDocumentsInteractively settings allDocs . applyFilters filters $ allDocs pure () Args {cmd = Todo} -> do allDocs <- getDocuments _ <- processDocumentsInteractively settings allDocs . applyFilters [Filter Include (FilterByTag "todo")] $ allDocs pure () Args {cmd = List {filters, redo, view = True}} -> do doRedoIf filters redo viewDocuments . applyFilters filters =<< getDocuments Args {cmd = View {indexNames}} -> do viewDocuments =<< mapM (readDocument . (<.> "json")) indexNames printTags :: Document -> IO () printTags doc = mapM_ ( \tag -> case tagValue tag of Nothing -> printf "@%s\n" (tagKey tag) Just tagValue -> printf "@%s %s\n" (tagKey tag) tagValue ) (doc.index.tags `S.union` doc.index.internalTags) doRedoIf :: [Filter] -> Bool -> IO () doRedoIf filters redo = when redo do parMapM_ doRedo . applyFilters filters =<< getDocuments where doRedo doc = do originalText <- ocr doc.oFilePath withGit do J.encodeFile doc.iFilePath doc.index {originalText = originalText} commitAll [doc.iFilePath] (printf "redo %s" (takeBaseName doc.iFilePath)) data Document = Document { iFileName :: String, index :: Index } deriving (Show) instance HasField "oFilePath" Document FilePath where getField doc = "originals" takeBaseName doc.iFileName <.> "pdf" instance HasField "iFilePath" Document FilePath where getField doc = "index" doc.iFileName tagValues :: [Document] -> M.Map T.Text (S.Set T.Text) tagValues docs = M.unionsWith S.union $ mapMaybe ( \(Tag tagKey tagValue) -> M.singleton tagKey . S.singleton <$> tagValue ) (S.toList (S.unions (map (.index.tags) docs))) getDocuments :: IO [Document] getDocuments = parMapM readDocument =<< sort <$> listDirectory "index" readDocument :: FilePath -> IO Document readDocument iFileName = Document iFileName <$> decodeFile @Index ("index" iFileName) applyFilters :: [Filter] -> [Document] -> [Document] applyFilters filters = filter (pred filters) `at` (.index.internalTags) where pred1 (Filter Include filter') = pred1' filter' pred1 (Filter Exclude filter') = not . pred1' filter' pred1' (FilterByTag tagKey) = hasTag (Tag tagKey Nothing) pred filters = \index -> all ($ index) (map pred1 filters) at :: ([a] -> [a]) -> (b -> a) -> [b] -> [b] at _ _ [] = [] at g f (x : xs) | null (g [f x]) = at g f xs | otherwise = x : at g f xs processDocumentsInteractively :: S.Settings -> [Document] -> [Document] -> IO [Document] processDocumentsInteractively settings allDocs docs = mapM (uncurry processDocumentInteractively) (zip [1 :: Int ..] docs) where numDocs = length docs tagValues' = tagValues docs processDocumentInteractively n doc = do choice <- P.prompt ( P.choice ( printf "%s\n%s\n\n(%d/%d) Process this document?" (takeBaseName doc.iFileName) doc.index.shortText n numDocs ) (("f" :: String) N.:| ["p", "s", "v"]) & P.help ( \a -> case a of "f" -> "view full text" "p" -> "process document" "s" -> "skip document" "v" -> "view document" ) ) case choice of "f" -> do printf "%s\n" (takeBaseName doc.iFileName) printf "%s\n" doc.index.originalText processDocumentInteractively n doc "p" -> tagDocumentInteractively settings allDocs doc "s" -> pure doc "v" -> do viewDocuments [doc] processDocumentInteractively n doc viewDocuments :: [Document] -> IO () viewDocuments docs = sh_ ( "zathura " <> ( intercalate " " (map (\doc -> printf "'%s'" doc.oFilePath) docs) ) ) editDocuments :: [Document] -> IO () editDocuments docs = withSystemTempDirectory "apaperless" $ \tmp -> do let fp doc = tmp takeBaseName doc.iFileName <.> "txt" mapM_ ( \doc -> T.writeFile (fp doc) doc.index.originalText ) docs sh_ ( "vim " <> ( intercalate " " (map (\doc -> printf "'%s'" (fp doc)) docs) ) ) mapM_ ( \doc -> do originalText <- T.readFile (fp doc) withGit do J.encodeFile doc.iFilePath doc.index {originalText = originalText} commitAll [doc.iFilePath] (printf "edit %s" (takeBaseName doc.iFilePath)) pure originalText ) docs suggestTags :: S.Settings -> [Document] -> Document -> IO [(Tag, [Tag])] suggestTags settings allDocs doc = do forM settings.suggestedTags $ \suggestedTag -> do case suggestedTag of S.SuggestTagByRE tagName pattern -> do let tagValues = nub . catMaybes . map R.matchedText . R.allMatches $ doc.index.originalText R.*=~ pattern pure (Tag tagName (Just ""), map (Tag tagName . Just) tagValues) S.SuggestTagByTags tagName -> do let allTags = foldl S.union S.empty (map (.index.tags) allDocs) allWords = foldl S.union S.empty (map (.index.originalWords) allDocs) hasWord word doc = S.member word doc.index.originalWords hasTag tag doc = S.member tag doc.index.tags hasWordAndTag word tag doc = hasTag tag doc && hasWord word doc fi = fromIntegral -- TODO Consider words that contribute to NOT adding a tag -- -- If there is a document that should, say, not have a `@correspondant`, we should score words that contribute to that fact as well. -- -- @topic probability-map -- TODO Cache `probabilityMap` probabilityMap = [ let docs = filter (hasWordAndTag word tag) allDocs p = fi (length docs) / fi (length allDocs) in (word, tag, p) | word <- S.toList allWords, tag <- S.toList allTags ] let tagValues = probabilityMap & filter (\(word, tag, _) -> hasWordAndTag word tag doc) & foldl' ( \scorePerTagValue (_, tag, p) -> M.insertWith (+) (tagValue tag) p scorePerTagValue ) M.empty & M.toList & sortBy (comparing (negate . snd)) & map fst pure (Tag tagName (Just ""), map (Tag tagName) tagValues) autoApplySuggestedTags :: [(Tag, [Tag])] -> [Tag] autoApplySuggestedTags = mapMaybe ( \(_, suggestedTags) -> if null suggestedTags then Nothing else Just (head suggestedTags) ) processDocuments :: S.Settings -> [Document] -> [Document] -> IO [Document] processDocuments settings allDocs docs = mapM processDocument docs where processDocument doc = do tags <- S.fromList . autoApplySuggestedTags <$> suggestTags settings allDocs doc let doc' = doc { index = doc.index { tags = doc.index.tags `S.union` tags, todo = False } } withGit do J.encodeFile doc.iFilePath doc'.index commitAll [doc.iFilePath] (printf "process %s (auto)" doc.iFilePath) pure doc' tagDocumentInteractively :: S.Settings -> [Document] -> Document -> IO Document tagDocumentInteractively settings allDocs doc = do suggestedTags <- suggestTags settings allDocs doc tags <- S.fromList . catMaybes <$> mapM (uncurry tagDocumentInteractively') suggestedTags let doc' = doc { index = doc.index { tags = doc.index.tags `S.union` tags, todo = False } } withGit do J.encodeFile doc.iFilePath doc'.index commitAll [doc.iFilePath] (printf "process %s (interactive)" doc.iFilePath) pure doc' where tagDocumentInteractively' :: Tag -> [Tag] -> IO (Maybe Tag) tagDocumentInteractively' tag@(Tag tagKey Nothing) tags = do choice <- P.prompt $ P.choice (printf "tag with %s?" tagKey) (("n" :: String) N.:| ["y"]) pure $ if (choice == "y") then Just tag else Nothing tagDocumentInteractively' (Tag tagKey (Just _)) tags = do tagValue <- fmap T.pack . P.prompt $ P.string (printf "tag with %s?" tagKey) (map T.unpack $ mapMaybe tagValue tags) pure $ if not (T.null tagValue) then Just (Tag tagKey (Just tagValue)) else Nothing ensureGit :: IO () ensureGit = do doesExist <- doesDirectoryExist ".git" unless doesExist $ sh_ ">/dev/null git init --initial-branch main" ensureDir :: FilePath -> IO () ensureDir dirName = createDirectoryIfMissing False dirName debug :: Show a => String -> a -> a debug s x = trace (printf "%s: %s\n" s (show x)) x fileKey :: FilePath -> IO FilePath fileKey filePath = showDigest . sha1 <$> LB.readFile filePath consume1 :: Bool -> Bool -> FilePath -> IO FilePath consume1 force keep filePath = do fKey <- fileKey filePath let oFilePath = "originals" fKey <.> takeExtension filePath originalExists <- doesFileExist oFilePath when (originalExists && not force) do error (printf "error: error adding %s: duplicate of %s\n" filePath oFilePath) let iFilePath = "index" fKey <.> "json" originalText <- do originalText' <- T.decodeUtf8 . LB.toStrict <$> sh (printf "pdftotext -layout '%s' -" filePath) let hasText = (not . T.null) . T.strip $ originalText' if not hasText then ocr filePath else pure originalText' addedAt <- getCurrentTime withGit do J.encodeFile iFilePath Index {tags = S.empty, todo = True, ..} if keep then copyFile filePath oFilePath else renameFile filePath oFilePath commitAll [iFilePath, oFilePath] (printf "add %s" (takeFileName filePath)) pure (takeBaseName iFilePath) withGit :: IO a -> IO a withGit = withLockFile def ".gitlock" commitAll :: [FilePath] -> String -> IO () commitAll fps m = do sh_ (">/dev/null git add -- " ++ intercalate " " (map (printf "'%s'") fps)) sh_ (printf ">/dev/null git commit -m '%s' || :" m) data DecodeException = DecodeException FilePath String deriving (Show) instance Exception DecodeException decodeFile :: J.FromJSON a => FilePath -> IO a decodeFile fp = either (throwIO . DecodeException fp) pure . J.eitherDecode =<< LB.readFile fp ocr :: FilePath -> IO T.Text ocr input = withSystemTempDirectory (takeBaseName input) $ \tmp -> do let fn suffix = tmp takeBaseName input <> suffix pdfInfo <- parsePdfInfo <$> sh (printf "pdfinfo '%s'" input) pdfImages <- parsePdfImages <$> sh (printf "pdfimages -list '%s'" input) let isScan = length pdfImages == pdfInfo.numPages && all ((pdfInfo.pageSize ==) . imageSize) pdfImages if isScan then sh_ (printf "pdfimages '%s' '%s' -tiff" input (fn "")) else sh_ (printf "pdftoppm '%s' '%s' -r 300 -tiff" input (fn "-%d.pdf")) imageFiles <- sort <$> listDirectory tmp -- XXX add DPI information to image so that resulting pdf preserves DPI parMapM_ ( \(pdfImage, imageFile) -> sh_ ( printf "convert -density %dx%d -units PixelsPerInch '%s' '%s'" pdfImage.xPpi pdfImage.yPpi (tmp imageFile) (tmp imageFile) ) ) (zip pdfImages imageFiles) T.unlines <$> mapM (ocr1 tmp . (tmp )) imageFiles ocr1 :: FilePath -> FilePath -> IO T.Text ocr1 tmp input = T.decodeUtf8 . LB.toStrict -- XXX `--oem 1` seems to be unavailable <$> sh (printf "tesseract '%s' - -l deu+eng --oem 3 --psm 1" (tmp input)) data Index = Index { originalText :: T.Text, tags :: S.Set Tag, addedAt :: UTCTime, todo :: Bool } deriving (Show, Generic, Eq) instance J.ToJSON Index instance J.FromJSON Index instance HasField "shortText" Index T.Text where getField = (T.unlines . take 10 . T.lines) . (.originalText) instance HasField "internalTags" Index (S.Set Tag) where getField index = index.tags `S.union` internalTags index -- TODO Cache `originalWords` -- -- @related cache-probabilitymap instance HasField "originalWords" Index (S.Set T.Text) where getField index = S.fromList (T.words index.originalText) internalTags :: Index -> S.Set Tag internalTags index = S.fromList ( concat [ [Tag "addedAt" (Just (T.pack (iso8601Show index.addedAt)))], if index.todo then [Tag "todo" Nothing] else [] ] ) data PdfInfo = PdfInfo { numPages :: Int, pageSize :: (Double, Double) } deriving (Show) data PdfInfoException = PdfInfoException deriving (Show) instance Exception PdfInfoException parsePdfInfo :: LB.ByteString -> PdfInfo parsePdfInfo out' = fromMaybe (throw PdfInfoException) $ do numPages <- readMaybe . T.unpack =<< M.lookup "Pages" kvs pageSize <- rightToMaybe . A.parseOnly pageSizeParser =<< M.lookup "Page size" kvs pure PdfInfo {..} where out = T.decodeUtf8 (LB.toStrict out') kvs = M.fromList . map (second T.stripStart) . map (second T.tail . T.break (== ':')) . filter (not . T.null) . T.lines $ out pageSizeParser = (,) <$> (A.double <* A.string " x ") <*> (A.double <* A.string " pts (A4)") <* A.endOfInput type PdfImages = [PdfImage] data PdfImage = PdfImage { page :: Int, num :: Int, type_ :: String, width :: Int, height :: Int, color :: String, comp :: Int, bpc :: Int, enc :: String, interp :: String, object :: Int, id :: Int, xPpi :: Int, yPpi :: Int, size :: String, ratio :: String } deriving (Show) imageSize :: PdfImage -> (Double, Double) imageSize (PdfImage {..}) = let f ppi p = 72 * fromIntegral p / fromIntegral ppi in (f xPpi width, f yPpi height) data PdfImagesException = PdfImagesException deriving (Show) instance Exception PdfImagesException data ProcessException = ProcessException Int LB.ByteString deriving (Show) instance Exception ProcessException parsePdfImages :: LB.ByteString -> PdfImages parsePdfImages out' = map ( \(page' : num' : type_ : width' : height' : color : comp' : bpc' : enc : interp : object' : id' : xPpi' : yPpi' : size : ratio : []) -> PdfImage { page = read page', num = read num', width = read width', height = read height', comp = read comp', bpc = read bpc', object = read object', id = read id', xPpi = read xPpi', yPpi = read yPpi', .. } ) . map (map T.unpack) . map T.words . drop 2 . filter (not . T.null) . T.lines $ out where out = T.decodeUtf8 (LB.toStrict out') sh :: String -> IO LB.ByteString sh cmd = do -- printf "+ %s\n" cmd (exitCode, out, err) <- readProcess (fromString cmd) case exitCode of ExitSuccess -> return out ExitFailure exitCode' -> throwIO $ ProcessException exitCode' err sh_ :: String -> IO () sh_ cmd = do -- printf "+ %s\n" cmd (exitCode, err) <- readProcessStderr (fromString cmd) case exitCode of ExitSuccess -> return () ExitFailure exitCode' -> throwIO $ ProcessException exitCode' err rightToMaybe :: Either e a -> Maybe a rightToMaybe = either (const Nothing) Just parMapM :: (a -> IO b) -> [a] -> IO [b] parMapM f xs = do n <- getNumProcessors withPool n $ \pool -> parallel pool (map f xs) parMapM_ :: (a -> IO b) -> [a] -> IO () parMapM_ f = fmap (const ()) . parMapM f