{-# LANGUAGE BlockArguments #-} {-# LANGUAGE DataKinds #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedRecordDot #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE ViewPatterns #-} {-# LANGUAGE NoFieldSelectors #-} {-# OPTIONS_GHC -fno-warn-name-shadowing #-} module Main where import Control.Arrow (second) import Control.Concurrent.ParallelIO.Local (parallel, withPool) import Control.Exception (Exception, throw, throwIO) import Control.Monad (unless, when) import Data.Aeson qualified as J import Data.Attoparsec.Text qualified as A import Data.ByteString.Lazy qualified as LB import Data.Default import Data.Digest.Pure.SHA (sha256, showDigest) import Data.List import Data.Map qualified as M import Data.Maybe (catMaybes, fromMaybe) import Data.Set qualified as S import Data.String (IsString (fromString)) import Data.Text qualified as T import Data.Text.Encoding qualified as T import Data.Text.IO qualified as T import Debug.Trace import GHC.Conc (getNumProcessors) import GHC.Generics (Generic) import GHC.Records (HasField (..)) import Options.Applicative qualified as O import System.Directory import System.Environment (getEnv) import System.FilePath import System.IO import System.IO.LockFile (withLockFile) import System.IO.Temp (withSystemTempDirectory) import System.Process.Typed import Text.Printf (printf) import Text.Read (readMaybe) data Args = Args { cmd :: Cmd } data Cmd = Consume { keep :: Bool, inputs :: [FilePath] } | List { filters :: [Filter], todo :: Bool, view :: Bool, redo :: Bool } | Todo | View { indexNames :: [FilePath] } args :: O.Parser Args args = Args <$> cmd cmd :: O.Parser Cmd cmd = O.hsubparser . mconcat $ [ O.command "consume" . O.info consumeCmd $ O.progDesc "Consume document(s)", O.command "list" . O.info listCmd $ O.progDesc "List document(s)", O.command "todo" . O.info todoCmd $ O.progDesc "Interactively process new documents", O.command "view" . O.info viewCmd $ O.progDesc "View document(s)" ] consumeCmd :: O.Parser Cmd consumeCmd = Consume <$> keepArg <*> inputsArg listCmd :: O.Parser Cmd listCmd = List <$> filtersArg <*> todoArg <*> viewArg <*> redoArg todoCmd :: O.Parser Cmd todoCmd = pure Todo viewCmd :: O.Parser Cmd viewCmd = View <$> indexNamesArg inputsArg :: O.Parser [FilePath] inputsArg = O.many ( O.strArgument ( O.metavar "FILE" <> O.action "file" ) ) keepArg :: O.Parser Bool keepArg = O.switch ( O.long "keep" <> O.help "Keep input document" ) filtersArg :: O.Parser [Filter] filtersArg = O.many $ O.option (O.maybeReader parse) ( O.long "filter" <> O.short 'f' <> O.help "Filter documents by tag" ) where parse ('@' : tagKey) = Just (FilterByTag (T.pack tagKey)) parse _ = Nothing indexNamesArg :: O.Parser [FilePath] indexNamesArg = O.many $ O.strArgument ( O.metavar "ID" <> O.action "file" ) todoArg :: O.Parser Bool todoArg = O.switch ( O.long "todo" <> O.help "Run command `todo` on listed document(s)" ) redoArg :: O.Parser Bool redoArg = O.switch ( O.long "redo" <> O.help "Redo OCR on listed document(s)" ) viewArg :: O.Parser Bool viewArg = O.switch ( O.long "view" <> O.help "Run command `view` on listed document(s)" ) data Filter = FilterByTag T.Text main :: IO () main = do cwd <- getCurrentDirectory setCurrentDirectory =<< getEnv "APAPERLESS_STORE_DIR" ensureGit ensureDir "originals" ensureDir "index" O.execParser (O.info (args O.<**> O.helper) O.idm) >>= \case Args {cmd = Consume {keep, inputs}} -> mapM_ putStrLn =<< parMapM (consume1 keep) (map (cwd ) inputs) Args {cmd = List {filters, redo, todo = False, view = False}} -> do doRedoIf filters redo mapM_ ( \(Document {iFileName, index}) -> do if hasTag (Tag "todo" Nothing) index then printf "TODO %s\n" (takeBaseName iFileName) else printf " %s\n" (takeBaseName iFileName) ) . applyFilters filters =<< getDocuments Args {cmd = List {filters, redo, todo = True}} -> do doRedoIf filters redo processDocuments . applyFilters filters =<< getDocuments Args {cmd = Todo} -> do processDocuments . applyFilters [FilterByTag "todo"] =<< getDocuments Args {cmd = List {filters, redo, view = True}} -> do doRedoIf filters redo viewDocuments . applyFilters filters =<< getDocuments Args {cmd = View {indexNames}} -> do viewDocuments =<< mapM (readDocument . (<.> "json")) indexNames doRedoIf :: [Filter] -> Bool -> IO () doRedoIf filters redo = when redo do parMapM_ doRedo . applyFilters filters =<< getDocuments where doRedo doc = do originalText <- ocr doc.oFilePath withGit do J.encodeFile doc.iFilePath doc.index {originalText = originalText} commitAll [doc.iFilePath] (printf "redo %s" (takeBaseName doc.iFilePath)) data Document = Document { iFileName :: String, index :: Index } deriving (Show) instance HasField "oFilePath" Document FilePath where getField doc = "originals" takeBaseName doc.iFileName <.> "pdf" instance HasField "iFilePath" Document FilePath where getField doc = "index" doc.iFileName getDocuments :: IO [Document] getDocuments = parMapM readDocument =<< sort <$> listDirectory "index" readDocument :: FilePath -> IO Document readDocument iFileName = Document iFileName <$> decodeFile @Index ("index" iFileName) applyFilters :: [Filter] -> [Document] -> [Document] applyFilters filters = filter (pred filters) `at` (.index) where pred1 (FilterByTag tagKey) = hasTag (Tag tagKey Nothing) pred filters = \index -> all ($ index) (map pred1 filters) at :: ([a] -> [a]) -> (b -> a) -> [b] -> [b] at _ _ [] = [] at g f (x : xs) | null (g [f x]) = at g f xs | otherwise = x : at g f xs processDocuments :: [Document] -> IO () processDocuments docs = mapM_ (uncurry processDocuments') (zip [1 :: Int ..] docs) where numDocs = length docs processDocuments' n (doc@Document {iFileName, index}) = do choice <- promptChoiceHelp [ ("f", "view full text"), ("p", "process document"), ("s", "skip document"), ("v", "view document") ] ( printf "%s\n%s\n\n(%d/%d) Process this document?" (takeBaseName iFileName) index.shortText n numDocs ) case choice of "f" -> do printf "%s\n" (takeBaseName doc.iFileName) printf "%s\n" ( T.unlines . filter (not . T.null) . map T.strip . T.lines $ doc.index.originalText ) processDocuments' n doc "p" -> processDocument doc "s" -> pure () "v" -> do viewDocuments [doc] processDocuments' n doc viewDocuments :: [Document] -> IO () viewDocuments docs = sh_ ( "zathura " <> ( intercalate " " (map (\doc -> printf "'%s'" doc.oFilePath) docs) ) ) processDocument :: Document -> IO () processDocument (Document {iFileName, index}) = do printf "%s\n" index.originalText let suggestedTags = [ Tag "correspondent" (Just ""), Tag "invoice" Nothing, Tag "bill" Nothing ] tags <- S.fromList . catMaybes <$> mapM processSuggestedTag suggestedTags let tags' = S.delete (Tag "todo" Nothing) (index.tags `S.union` tags) index' = index {tags = tags'} iFilePath = "index" iFileName withGit do J.encodeFile iFilePath index' commitAll [iFilePath] (printf "process %s (interactive)" iFilePath) processSuggestedTag :: Tag -> IO (Maybe Tag) processSuggestedTag tag@(Tag tagKey Nothing) = do choice <- promptChoice (Just "n") ["n", "y"] (printf "tag with %s?" tagKey) pure $ if (choice == "y") then Just tag else Nothing processSuggestedTag (Tag tagKey (Just _)) = do tagValue <- promptString [] (printf "tag with %s?" tagKey) pure $ if not (T.null tagValue) then Just (Tag tagKey (Just tagValue)) else Nothing promptChoice :: Maybe T.Text -> [T.Text] -> String -> IO T.Text promptChoice mDef as s = do a <- T.toLower <$> promptString [] ( s ++ ( T.unpack ( " [" <> T.intercalate "" (map capitalizeDef as) <> "]" ) ) ) case (a, mDef) of ("", Just def) -> pure def _ -> if not (T.toLower a `elem` map T.toLower as) then promptChoice mDef as s else pure a where capitalizeDef a = (if Just a == mDef then T.toUpper else T.toLower) a promptChoiceHelp :: [(T.Text, T.Text)] -> String -> IO T.Text promptChoiceHelp as' s = do a <- T.toLower <$> promptString [] ( s ++ ( T.unpack (" [" <> T.intercalate "" (as ++ ["?"]) <> "]") ) ) if a == "?" then do printHelp promptChoiceHelp as' s else if not (T.toLower a `elem` map T.toLower as) then promptChoiceHelp as' s else pure a where as = map fst as' printHelp = mapM_ (uncurry (printf "%s - %s\n")) as' promptString :: [T.Text] -> String -> IO T.Text promptString as s = do if null as then do putStr (s <> "> ") else do putStrLn s mapM_ (\(n, a) -> printf "[%d] %s\n" n a) (zip [1 :: Int ..] as) putStr "> " hFlush stdout a <- T.strip <$> T.getLine case (as, readMaybe (T.unpack a)) of ((_ : _), Just n) -> case drop (n - 1) as of [] -> promptString as s (a' : _) -> pure a' _ -> pure a ensureGit :: IO () ensureGit = do doesExist <- doesDirectoryExist ".git" unless doesExist $ sh_ "git init --initial-branch main" ensureDir :: FilePath -> IO () ensureDir dirName = createDirectoryIfMissing False dirName debug :: Show a => String -> a -> a debug s x = trace (printf "%s: %s\n" s (show x)) x fileKey :: FilePath -> IO FilePath fileKey filePath = showDigest . sha256 <$> LB.readFile filePath consume1 :: Bool -> FilePath -> IO FilePath consume1 keep filePath = do fKey <- fileKey filePath let oFilePath = "originals" fKey <.> takeExtension filePath originalExists <- doesFileExist oFilePath when originalExists do error (printf "error: error adding %s: duplicate of %s\n" filePath oFilePath) let iFilePath = "index" fKey <.> "json" originalText <- do originalText' <- T.decodeUtf8 . LB.toStrict <$> sh (printf "pdftotext '%s' -" filePath) let hasText = (not . T.null) . T.strip $ originalText' if not hasText then ocr filePath else pure originalText' withGit do J.encodeFile iFilePath Index {tags = S.singleton (Tag "todo" Nothing), ..} if keep then copyFile filePath oFilePath else renameFile filePath oFilePath commitAll [iFilePath, oFilePath] (printf "add %s" (takeFileName filePath)) pure (takeBaseName iFilePath) withGit :: IO a -> IO a withGit = withLockFile def ".gitlock" commitAll :: [FilePath] -> String -> IO () commitAll fps m = do sh_ ("git add -- " ++ intercalate " " (map (printf "'%s'") fps)) sh_ (printf "git commit -m '%s' || :" m) data DecodeException = DecodeException FilePath String deriving (Show) instance Exception DecodeException decodeFile :: J.FromJSON a => FilePath -> IO a decodeFile fp = either (throwIO . DecodeException fp) pure . J.eitherDecode =<< LB.readFile fp ocr :: FilePath -> IO T.Text ocr input = withSystemTempDirectory (takeBaseName input) $ \tmp -> do let fn suffix = tmp takeBaseName input <> suffix pdfInfo <- parsePdfInfo <$> sh (printf "pdfinfo '%s'" input) pdfImages <- parsePdfImages <$> sh (printf "pdfimages -list '%s'" input) let isScan = length pdfImages == pdfInfo.numPages && all ((pdfInfo.pageSize ==) . imageSize) pdfImages if isScan then sh_ (printf "pdfimages '%s' '%s' -tiff" input (fn "")) else sh_ (printf "pdftoppm '%s' '%s' -r 300 -tiff" input (fn "-%d.pdf")) imageFiles <- sort <$> listDirectory tmp -- XXX add DPI information to image so that resulting pdf preserves DPI parMapM_ ( \(pdfImage, imageFile) -> sh_ ( printf "convert -density %dx%d -units PixelsPerInch '%s' '%s'" pdfImage.xPpi pdfImage.yPpi (tmp imageFile) (tmp imageFile) ) ) (zip pdfImages imageFiles) T.unlines <$> mapM (ocr1 tmp . (tmp )) imageFiles ocr1 :: FilePath -> FilePath -> IO T.Text ocr1 tmp input = T.decodeUtf8 . LB.toStrict -- XXX `--oem 1` seems to be unavailable <$> sh (printf "tesseract '%s' - -l deu+eng --oem 3 --psm 1" (tmp input)) data Index = Index { originalText :: T.Text, tags :: S.Set Tag } deriving (Show, Generic, Eq) instance J.ToJSON Index instance J.FromJSON Index instance HasField "shortText" Index T.Text where getField = T.unlines . take 10 . filter (not . T.null) . map T.strip . T.lines . (.originalText) data Tag = Tag T.Text (Maybe T.Text) deriving (Show, Generic, Eq, Ord) tagKey :: Tag -> T.Text tagKey (Tag x _) = x tagValue :: Tag -> Maybe T.Text tagValue (Tag _ x) = x hasTag :: Tag -> Index -> Bool hasTag tag = (tagKey tag `S.member`) . S.map tagKey . (.tags) instance J.ToJSON Tag instance J.FromJSON Tag data PdfInfo = PdfInfo { numPages :: Int, pageSize :: (Double, Double) } deriving (Show) data PdfInfoException = PdfInfoException deriving (Show) instance Exception PdfInfoException parsePdfInfo :: LB.ByteString -> PdfInfo parsePdfInfo out' = fromMaybe (throw PdfInfoException) $ do numPages <- readMaybe . T.unpack =<< M.lookup "Pages" kvs pageSize <- rightToMaybe . A.parseOnly pageSizeParser =<< M.lookup "Page size" kvs pure PdfInfo {..} where out = T.decodeUtf8 (LB.toStrict out') kvs = M.fromList . map (second T.stripStart) . map (second T.tail . T.break (== ':')) . filter (not . T.null) . T.lines $ out pageSizeParser = (,) <$> (A.double <* A.string " x ") <*> (A.double <* A.string " pts (A4)") <* A.endOfInput type PdfImages = [PdfImage] data PdfImage = PdfImage { page :: Int, num :: Int, type_ :: String, width :: Int, height :: Int, color :: String, comp :: Int, bpc :: Int, enc :: String, interp :: String, object :: Int, id :: Int, xPpi :: Int, yPpi :: Int, size :: String, ratio :: String } deriving (Show) imageSize :: PdfImage -> (Double, Double) imageSize (PdfImage {..}) = let f ppi p = 72 * fromIntegral p / fromIntegral ppi in (f xPpi width, f yPpi height) data PdfImagesException = PdfImagesException deriving (Show) instance Exception PdfImagesException data ProcessException = ProcessException Int LB.ByteString deriving (Show) instance Exception ProcessException parsePdfImages :: LB.ByteString -> PdfImages parsePdfImages out' = map ( \(page' : num' : type_ : width' : height' : color : comp' : bpc' : enc : interp : object' : id' : xPpi' : yPpi' : size : ratio : []) -> PdfImage { page = read page', num = read num', width = read width', height = read height', comp = read comp', bpc = read bpc', object = read object', id = read id', xPpi = read xPpi', yPpi = read yPpi', .. } ) . map (map T.unpack) . map T.words . drop 2 . filter (not . T.null) . T.lines $ out where out = T.decodeUtf8 (LB.toStrict out') sh :: String -> IO LB.ByteString sh cmd = do -- printf "+ %s\n" cmd (exitCode, out, err) <- readProcess (fromString cmd) case exitCode of ExitSuccess -> return out ExitFailure exitCode' -> throwIO $ ProcessException exitCode' err sh_ :: String -> IO () sh_ = fmap (\_ -> ()) . sh rightToMaybe :: Either e a -> Maybe a rightToMaybe = either (const Nothing) Just parMapM :: (a -> IO b) -> [a] -> IO [b] parMapM f xs = do n <- getNumProcessors withPool n $ \pool -> parallel pool (map f xs) parMapM_ :: (a -> IO b) -> [a] -> IO () parMapM_ f = fmap (const ()) . parMapM f