diff options
-rw-r--r-- | app/Main.hs | 251 |
1 files changed, 237 insertions, 14 deletions
diff --git a/app/Main.hs b/app/Main.hs index 920d3a6..32f4037 100644 --- a/app/Main.hs +++ b/app/Main.hs @@ -1,8 +1,10 @@ {-# LANGUAGE BlockArguments #-} +{-# LANGUAGE DataKinds #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedRecordDot #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RecordWildCards #-} +{-# LANGUAGE ViewPatterns #-} {-# LANGUAGE NoFieldSelectors #-} {-# OPTIONS_GHC -fno-warn-name-shadowing #-} @@ -17,9 +19,11 @@ import Data.Attoparsec.Text qualified as A import Data.ByteString.Lazy qualified as LB import Data.Default import Data.Digest.Pure.SHA (sha256, showDigest) +import Data.Function ((&)) import Data.List import Data.Map qualified as M -import Data.Maybe (fromMaybe) +import Data.Maybe (catMaybes, fromMaybe) +import Data.Set qualified as S import Data.String (IsString (fromString)) import Data.Text qualified as T import Data.Text.Encoding qualified as T @@ -27,10 +31,12 @@ import Data.Text.IO qualified as T import Debug.Trace import GHC.Conc (getNumProcessors) import GHC.Generics (Generic) +import GHC.Records (HasField (..)) import Options.Applicative qualified as O import System.Directory import System.Environment (getEnv) import System.FilePath +import System.IO import System.IO.LockFile (withLockFile) import System.IO.Temp (withSystemTempDirectory) import System.Process.Typed @@ -43,7 +49,8 @@ data Args = Args data Cmd = Consume Bool [FilePath] - | List + | List [Filter] + | Todo args :: O.Parser Args args = @@ -55,7 +62,9 @@ cmd = [ O.command "consume" . O.info consumeCmd $ O.progDesc "Consume document(s)", O.command "list" . O.info listCmd $ - O.progDesc "List document(s)" + O.progDesc "List document(s)", + O.command "todo" . O.info todoCmd $ + O.progDesc "Interactively process new documents" ] consumeCmd :: O.Parser Cmd @@ -66,7 +75,12 @@ consumeCmd = listCmd :: O.Parser Cmd listCmd = - pure List + List + <$> filterArg + +todoCmd :: O.Parser Cmd +todoCmd = + pure Todo filePathsArg :: O.Parser [FilePath] filePathsArg = @@ -79,6 +93,22 @@ keepArg = <> O.help "Keep input document" ) +filterArg :: O.Parser [Filter] +filterArg = + O.many $ + O.option + (O.maybeReader parse) + ( O.long "filter" + <> O.short 'f' + <> O.help "Filter documents by tag" + ) + where + parse ('@' : tagKey) = Just (FilterByTag (T.pack tagKey)) + parse _ = Nothing + +data Filter + = FilterByTag T.Text + main :: IO () main = do cwd <- getCurrentDirectory @@ -91,17 +121,183 @@ main = do Args {cmd = Consume keep filePaths} -> mapM_ putStrLn =<< parMapM (consume1 keep) (map (cwd </>) filePaths) - Args {cmd = List} -> do + Args {cmd = List filters} -> do mapM_ - ( \(iFileName, index) -> do - putStrLn (takeBaseName iFileName) - T.putStrLn index.originalText + ( \(Document {iFileName, index}) -> do + if hasTag (Tag "todo" Nothing) index + then printf "TODO %s\n" (takeBaseName iFileName) + else printf " %s\n" (takeBaseName iFileName) ) - =<< parMapM - ( \iFileName -> - (,) iFileName <$> decodeFile @Index ("index" </> iFileName) + . applyFilters filters + =<< getDocuments + Args {cmd = Todo} -> do + processDocuments + . applyFilters [FilterByTag "todo"] + =<< getDocuments + +data Document = Document + { iFileName :: String, + index :: Index + } + deriving (Show) + +instance HasField "oFilePath" Document FilePath where + getField doc = "originals" </> takeBaseName doc.iFileName <.> "pdf" + +instance HasField "iFilePath" Document FilePath where + getField doc = "index" </> doc.iFileName + +getDocuments :: IO [Document] +getDocuments = + parMapM + ( \iFileName -> + Document iFileName + <$> decodeFile @Index ("index" </> iFileName) + ) + =<< sort <$> listDirectory "index" + +applyFilters :: [Filter] -> [Document] -> [Document] +applyFilters filters = filter (pred filters) `at` (.index) + where + pred1 (FilterByTag tagKey) = hasTag (Tag tagKey Nothing) + pred filters = \index -> all ($ index) (map pred1 filters) + + at :: ([a] -> [a]) -> (b -> a) -> [b] -> [b] + at _ _ [] = [] + at g f (x : xs) + | null (g [f x]) = at g f xs + | otherwise = x : at g f xs + +processDocuments :: [Document] -> IO () +processDocuments docs = + mapM_ (uncurry processDocuments') (zip [1 :: Int ..] docs) + where + numDocs = length docs + processDocuments' n (doc@Document {iFileName, index}) = do + choice <- + promptChoiceHelp + [ ("f", "view full text"), + ("p", "process document"), + ("s", "skip document"), + ("v", "view document") + ] + ( printf + "%s\n%s\n\n(%d/%d) Process this document?" + (takeBaseName iFileName) + index.shortText + n + numDocs ) - =<< sort <$> listDirectory "index" + case choice of + "f" -> do + printf "%s\n" (takeBaseName doc.iFileName) + printf + "%s\n" + ( T.unlines + . filter (not . T.null) + . map T.strip + . T.lines + $ doc.index.originalText + ) + processDocuments' n doc + "p" -> processDocument doc + "s" -> pure () + "v" -> do + sh_ (printf "zathura '%s'" doc.oFilePath) + processDocuments' n doc + +processDocument :: Document -> IO () +processDocument (Document {iFileName, index}) = do + printf "%s\n" index.originalText + let suggestedTags = + [ Tag "correspondent" (Just ""), + Tag "invoice" Nothing, + Tag "bill" Nothing + ] + tags <- + S.fromList . catMaybes + <$> mapM processSuggestedTag suggestedTags + let tags' = S.delete (Tag "todo" Nothing) (index.tags `S.union` tags) + index' = index {tags = tags'} + iFilePath = "index" </> iFileName + withGit do + J.encodeFile iFilePath index' + commitAll [iFilePath] (printf "process %s (interactive)" iFilePath) + +processSuggestedTag :: Tag -> IO (Maybe Tag) +processSuggestedTag tag@(Tag tagKey Nothing) = do + choice <- promptChoice (Just "n") ["n", "y"] (printf "tag with %s?" tagKey) + pure $ if (choice == "y") then Just tag else Nothing +processSuggestedTag (Tag tagKey (Just _)) = do + tagValue <- promptString [] (printf "tag with %s?" tagKey) + pure $ + if not (T.null tagValue) + then Just (Tag tagKey (Just tagValue)) + else Nothing + +promptChoice :: Maybe T.Text -> [T.Text] -> String -> IO T.Text +promptChoice mDef as s = do + a <- + T.toLower + <$> promptString + [] + ( s + ++ ( T.unpack + ( " [" + <> T.intercalate "" (map capitalizeDef as) + <> "]" + ) + ) + ) + case (a, mDef) of + ("", Just def) -> pure def + _ -> + if not (T.toLower a `elem` map T.toLower as) + then promptChoice mDef as s + else pure a + where + capitalizeDef a = (if Just a == mDef then T.toUpper else T.toLower) a + +promptChoiceHelp :: [(T.Text, T.Text)] -> String -> IO T.Text +promptChoiceHelp as' s = do + a <- + T.toLower + <$> promptString + [] + ( s + ++ ( T.unpack + (" [" <> T.intercalate "" (as ++ ["?"]) <> "]") + ) + ) + if a == "?" + then do + printHelp + promptChoiceHelp as' s + else + if not (T.toLower a `elem` map T.toLower as) + then promptChoiceHelp as' s + else pure a + where + as = map fst as' + printHelp = mapM_ (uncurry (printf "%s - %s\n")) as' + +promptString :: [T.Text] -> String -> IO T.Text +promptString as s = do + if null as + then do + putStr (s <> "> ") + else do + putStrLn s + mapM_ (\(n, a) -> printf "[%d] %s\n" n a) (zip [1 :: Int ..] as) + putStr "> " + hFlush stdout + a <- T.strip <$> T.getLine + case (as, readMaybe (T.unpack a)) of + ((_ : _), Just n) -> + case drop (n - 1) as of + [] -> promptString as s + (a' : _) -> pure a' + _ -> pure a ensureGit :: IO () ensureGit = do @@ -137,7 +333,7 @@ consume1 keep filePath = do then ocr filePath else pure originalText' withGit do - J.encodeFile iFilePath Index {..} + J.encodeFile iFilePath Index {tags = S.singleton (Tag "todo" Nothing), ..} if keep then copyFile filePath oFilePath else renameFile filePath oFilePath @@ -198,7 +394,8 @@ ocr1 tmp input = <$> sh (printf "tesseract '%s' -" (tmp </> input)) data Index = Index - { originalText :: T.Text + { originalText :: T.Text, + tags :: S.Set Tag } deriving (Show, Generic, Eq) @@ -206,6 +403,32 @@ instance J.ToJSON Index instance J.FromJSON Index +instance HasField "shortText" Index T.Text where + getField = + T.unlines + . take 10 + . filter (not . T.null) + . map T.strip + . T.lines + . (.originalText) + +data Tag = Tag T.Text (Maybe T.Text) + deriving (Show, Generic, Eq, Ord) + +tagKey :: Tag -> T.Text +tagKey (Tag x _) = x + +tagValue :: Tag -> Maybe T.Text +tagValue (Tag _ x) = x + +hasTag :: Tag -> Index -> Bool +hasTag tag = + (tagKey tag `S.member`) . S.map tagKey . (.tags) + +instance J.ToJSON Tag + +instance J.FromJSON Tag + data PdfInfo = PdfInfo { numPages :: Int, pageSize :: (Double, Double) |