From 7bdf16be84b368655ce2ee3d9ab6bf185dfb59b5 Mon Sep 17 00:00:00 2001 From: Alexander Foremny Date: Thu, 11 Jan 2024 03:20:56 +0100 Subject: chore: make computing `probabilityMap` more performance --- app/Main.hs | 371 +++++++++++++++++++++--------------------------------------- 1 file changed, 131 insertions(+), 240 deletions(-) (limited to 'app/Main.hs') diff --git a/app/Main.hs b/app/Main.hs index 93fed75..936f053 100644 --- a/app/Main.hs +++ b/app/Main.hs @@ -1,5 +1,6 @@ {-# LANGUAGE BlockArguments #-} {-# LANGUAGE DataKinds #-} +{-# LANGUAGE DeriveAnyClass #-} {-# LANGUAGE DuplicateRecordFields #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE MultiWayIf #-} @@ -16,41 +17,42 @@ module Main where import Control.Arrow (first, second) import Control.Concurrent.ParallelIO.Local (parallel, withPool) import Control.Exception (Exception, throw, throwIO) -import Control.Monad (forM, unless, when) +import Control.Monad (unless, when) import Data.Aeson qualified as J import Data.Attoparsec.Text qualified as A import Data.ByteString.Lazy qualified as LB -import Data.Default import Data.Digest.Pure.SHA (sha1, showDigest) import Data.Either (partitionEithers) import Data.Function ((&)) +import Data.HashMap.Internal qualified as HM import Data.List import Data.List.NonEmpty qualified as N import Data.Map qualified as M -import Data.Maybe (catMaybes, fromMaybe, mapMaybe, maybeToList) +import Data.Maybe (fromMaybe, mapMaybe) import Data.Ord (comparing) import Data.Set qualified as S import Data.String (IsString (fromString)) import Data.Text qualified as T import Data.Text.Encoding qualified as T import Data.Text.IO qualified as T -import Data.Time.Clock (UTCTime, getCurrentTime) -import Data.Time.Format.ISO8601 (iso8601ParseM, iso8601Show) +import Data.Time.Clock (getCurrentTime) +import Data.Time.Format.ISO8601 (iso8601ParseM) import Debug.Trace +import Document qualified as D import GHC.Conc (getNumProcessors) -import GHC.Generics (Generic) -import GHC.Records (HasField (..)) import Options.Applicative qualified as O +import ProbabilityMap qualified as C import Prompt qualified as P import Settings qualified as S +import Store qualified as R import System.Directory import System.Environment (getEnv) import System.FilePath -import System.IO.LockFile (withLockFile) import System.IO.Temp (withSystemTempDirectory) import System.Process.Typed import Tag qualified as G import Text.Printf (printf) +import Text.RE.Replace qualified as R import Text.RE.TDFA.Text qualified as R import Text.Read (readMaybe) @@ -291,13 +293,13 @@ main = do keep ) (map (cwd ) inputs) - docs <- mapM (readDocument . (<.> "json")) indexNames - allDocs <- getDocuments + docs <- mapM (D.readDocument . (<.> "json")) indexNames + probabilityCache <- C.readProbabilityCache docs' <- if - | auto -> processDocuments settings allDocs docs + | auto -> processDocuments settings probabilityCache docs -- TODO adding tags interactively through prompt does not persist them in store - | prompt -> processDocumentsInteractively settings allDocs docs + | prompt -> processDocumentsInteractively settings probabilityCache docs | otherwise -> pure docs mapM_ ( \doc -> do @@ -307,7 +309,7 @@ main = do docs' Args {cmd = Edit {indexNames}} -> do editDocuments - =<< mapM (readDocument . (<.> "json")) indexNames + =<< mapM (D.readDocument . (<.> "json")) indexNames Args {cmd = List {filters, redo, todo = False, view = False, edit = False}} -> do doRedoIf filters redo mapM_ @@ -316,24 +318,26 @@ main = do printTags doc ) . filter (G.applyFilters filters . (.index.tags)) - =<< getDocuments + =<< D.getDocuments Args {cmd = List {filters, redo, edit = True}} -> do doRedoIf filters redo editDocuments . filter (G.applyFilters filters . (.index.tags)) - =<< getDocuments + =<< D.getDocuments Args {cmd = List {filters, redo, todo = True}} -> do doRedoIf filters redo - allDocs <- getDocuments + allDocs <- D.getDocuments + probabilityCache <- C.readProbabilityCache _ <- - processDocumentsInteractively settings allDocs + processDocumentsInteractively settings probabilityCache . filter (G.applyFilters filters . (.index.tags)) $ allDocs pure () Args {cmd = Todo} -> do - allDocs <- getDocuments + allDocs <- D.getDocuments + probabilityCache <- C.readProbabilityCache _ <- - processDocumentsInteractively settings allDocs + processDocumentsInteractively settings probabilityCache . filter (G.applyFilters [G.filter G.include "todo" Nothing] . (.index.tags)) $ allDocs pure () @@ -341,12 +345,12 @@ main = do doRedoIf filters redo viewDocuments . filter (G.applyFilters filters . (.index.tags)) - =<< getDocuments + =<< D.getDocuments Args {cmd = View {indexNames}} -> do viewDocuments - =<< mapM (readDocument . (<.> "json")) indexNames + =<< mapM (D.readDocument . (<.> "json")) indexNames Args {cmd = TopWords} -> do - allDocs <- getDocuments + probabilityCache <- C.readProbabilityCache mapM_ ( \(tag, xs) -> do print tag @@ -355,24 +359,23 @@ main = do $ map (second (sortBy (comparing (negate . snd)))) $ map (second (filter ((> 0) . snd))) $ M.toList - $ foldl - ( \wordProbabilityPerTag (word, tag, p) -> + $ M.foldlWithKey + ( \wordProbabilityPerTag (word, tag) p -> M.insertWith (++) tag [(word, p)] wordProbabilityPerTag ) M.empty - (probabilityMap allDocs) + probabilityCache.probabilityMap Args {cmd = Modify {indexNames, tags, untags}} -> do - docs <- mapM (readDocument . (<.> "json")) indexNames + docs <- mapM (D.readDocument . (<.> "json")) indexNames mapM_ - ( \doc -> do - let doc' = addTags tags (removeTags untags doc) - withGit do - J.encodeFile doc'.iFilePath doc'.index - commitAll [doc.iFilePath] (printf "tag %s" (takeBaseName doc.iFilePath)) + ( \doc -> + R.replaceDocument + (printf "tag %s" (takeBaseName doc.iFilePath)) + doc ) - docs + (map (addTags tags . removeTags untags) docs) -printTags :: Document -> IO () +printTags :: D.Document -> IO () printTags doc = mapM_ ( \tag -> @@ -380,76 +383,36 @@ printTags doc = Nothing -> printf "@%s\n" (G.tagKey tag) Just tagValue -> printf "@%s %s\n" (G.tagKey tag) tagValue ) - ( filter - ( \tag -> - -- TODO Handle tags hidden by default - -- - -- - Add `list-default-hide-tags` to `Settings` - -- - Add `--show-tags`, `--hide-tags` to `List` - not ("paperless-ngx." `T.isPrefixOf` G.tagKey tag) - && (G.tagKey tag /= "content") - && (G.tagKey tag /= "importedFrom") - && (G.tagKey tag /= "language") - ) + ( filter arbitrarilySelectTag . S.toList $ doc.index.tags `S.union` doc.index.internalTags ) +arbitrarilySelectTag :: G.Tag -> Bool +arbitrarilySelectTag tag = + -- TODO Handle tags hidden by default + -- + -- - Add `list-default-hide-tags` to `Settings` + -- - Add `--show-tags`, `--hide-tags` to `List` + not ("paperless-ngx." `T.isPrefixOf` G.tagKey tag) + && (G.tagKey tag /= "content") + && (G.tagKey tag /= "importedFrom") + && (G.tagKey tag /= "language") + doRedoIf :: [G.Filter] -> Bool -> IO () doRedoIf filters redo = when redo do parMapM_ doRedo . filter (G.applyFilters filters . (.index.tags)) - =<< getDocuments + =<< D.getDocuments where doRedo doc = do content <- ocr doc.index.language doc.oFilePath - withGit do - J.encodeFile doc.iFilePath doc.index {content = content} - commitAll [doc.iFilePath] (printf "redo %s" (takeBaseName doc.iFilePath)) - -data Document = Document - { iFileName :: String, - index :: Index - } - deriving (Show) - -instance HasField "oFilePath" Document FilePath where - getField doc = "originals" takeBaseName doc.iFileName <.> "pdf" - -instance HasField "iFilePath" Document FilePath where - getField doc = "index" doc.iFileName - -hasWord :: T.Text -> Document -> Bool -hasWord word doc = S.member word doc.index.originalWords - -hasTag :: G.Tag -> Document -> Bool -hasTag tag doc = S.member tag doc.index.tags - -hasWordAndTag :: T.Text -> G.Tag -> Document -> Bool -hasWordAndTag word tag doc = hasTag tag doc && hasWord word doc - -tagValues :: [Document] -> M.Map T.Text (S.Set T.Text) -tagValues docs = - M.unionsWith S.union $ - mapMaybe - ( \tag -> - M.singleton (G.tagKey tag) . S.singleton <$> (G.tagValue tag) - ) - (S.toList (S.unions (map (.index.tags) docs))) + let doc' = doc {D.index = doc.index {D.content = content}} + R.replaceDocument (printf "redo %s" (takeBaseName doc.iFilePath)) doc' -getDocuments :: IO [Document] -getDocuments = - parMapM readDocument - =<< sort <$> listDirectory "index" - -readDocument :: FilePath -> IO Document -readDocument iFileName = - Document iFileName - <$> decodeFile @Index ("index" iFileName) - -processDocumentsInteractively :: S.Settings -> [Document] -> [Document] -> IO [Document] -processDocumentsInteractively settings allDocs docs = +processDocumentsInteractively :: S.Settings -> C.ProbabilityCache -> [D.Document] -> IO [D.Document] +processDocumentsInteractively settings probabilityCache docs = do mapM (uncurry processDocumentInteractively) (zip [1 :: Int ..] docs) where numDocs = length docs @@ -478,13 +441,13 @@ processDocumentsInteractively settings allDocs docs = printf "%s\n" (takeBaseName doc.iFileName) printf "%s\n" doc.index.content processDocumentInteractively n doc - "p" -> tagDocumentInteractively settings allDocs doc + "p" -> tagDocumentInteractively settings probabilityCache doc "s" -> pure doc "v" -> do viewDocuments [doc] processDocumentInteractively n doc -viewDocuments :: [Document] -> IO () +viewDocuments :: [D.Document] -> IO () viewDocuments docs = sh_ ( "zathura " @@ -494,7 +457,7 @@ viewDocuments docs = ) ) -editDocuments :: [Document] -> IO () +editDocuments :: [D.Document] -> IO () editDocuments docs = withSystemTempDirectory "apaperless" $ \tmp -> do let fp doc = tmp takeBaseName doc.iFileName <.> "txt" @@ -513,63 +476,49 @@ editDocuments docs = mapM_ ( \doc -> do content <- T.readFile (fp doc) - withGit do - J.encodeFile doc.iFilePath doc.index {content = content} - commitAll [doc.iFilePath] (printf "edit %s" (takeBaseName doc.iFilePath)) + let doc' = doc {D.index = doc.index {D.content = content}} + R.replaceDocument (printf "edit %s" (takeBaseName doc.iFilePath)) doc' pure content ) docs -type ProbabilityMap = [(T.Text, G.Tag, Double)] - --- TODO Consider words that contribute to NOT adding a tag --- --- If there is a document that should, say, not have a `@correspondent`, we should score words that contribute to that fact as well. --- --- @topic probability-map -probabilityMap :: [Document] -> ProbabilityMap -probabilityMap allDocs = - [ let docsWithWord = filter (hasWord word) allDocs - docsWithTag = filter (hasTag tag) allDocs - docsWithWordAndTag = filter (hasWordAndTag word tag) allDocs - p = - fi (length docsWithWordAndTag) - / fi - ( length docsWithWord - + length docsWithTag - - length docsWithWordAndTag - ) - in (word, tag, p) - | word <- S.toList allWords, - tag <- S.toList allTags - ] - where - allTags = foldl S.union S.empty (map (.index.tags) allDocs) - allWords = foldl S.union S.empty (map (.index.originalWords) allDocs) - fi = fromIntegral @Int @Double - -suggestTags :: S.Settings -> [Document] -> Document -> IO [(G.Tag, [G.Tag])] -suggestTags settings allDocs doc = do - forM settings.suggestedTags $ \suggestedTag -> do +suggestTags :: S.Settings -> C.ProbabilityCache -> D.Document -> [(G.Tag, [G.Tag])] +suggestTags settings probabilityCache doc = do + flip map settings.suggestedTags $ \suggestedTag -> case suggestedTag of - S.SuggestTagByRE tagName pattern -> do + S.SuggestTagByRE tagName pattern -> let tagValues = - nub . catMaybes . map R.matchedText . R.allMatches $ - doc.index.content - R.*=~ pattern - pure (G.tag tagName (Just ""), map (G.tag tagName . Just) tagValues) - S.SuggestTagByTags tagName -> do + nub + . mapMaybe + ( -- XXX Whys is this so complicated? + \match -> do + captures <- snd <$> R.matchCaptures match + pure + ( foldl + ( \template (captureName, captureOrdinal) -> + T.replace + ("${" <> R.getCaptureName captureName <> "}") + (R.capturedText (captures !! (fromEnum captureOrdinal - 1))) + template + ) + (R.getTemplate pattern) + (HM.toList $ R.captureNames match) + ) + ) + $ R.allMatches (doc.index.content R.*=~ (R.getSearch pattern)) + in (G.tag tagName (Just ""), map (G.tag tagName . Just) tagValues) + S.SuggestTagByTags tagName -> let tagValues = - -- TODO Cache `probabilityMap` + -- TODO Cache `probabilityCache` -- -- @topic probability-map - probabilityMap allDocs - & filter - ( \(word, tag, _) -> - G.tagKey tag == tagName && hasWord word doc + probabilityCache.probabilityMap + & M.filterWithKey + ( \(word, tag) _ -> + G.tagKey tag == tagName && D.hasWord word doc ) - & foldl' - ( \scorePerTagValue (_, tag, p) -> + & M.foldlWithKey' + ( \scorePerTagValue (_, tag) p -> M.insertWith (+) (G.tagValue tag) @@ -580,7 +529,7 @@ suggestTags settings allDocs doc = do & M.toList & sortBy (comparing (negate . snd)) & map fst - pure (G.tag tagName (Just ""), map (G.tag tagName) tagValues) + in (G.tag tagName (Just ""), map (G.tag tagName) tagValues) autoApplySuggestedTags :: [(G.Tag, [G.Tag])] -> [G.Tag] autoApplySuggestedTags = @@ -591,28 +540,26 @@ autoApplySuggestedTags = else Just (head suggestedTags) ) -processDocuments :: S.Settings -> [Document] -> [Document] -> IO [Document] -processDocuments settings allDocs docs = +processDocuments :: S.Settings -> C.ProbabilityCache -> [D.Document] -> IO [D.Document] +processDocuments settings probabilityCache docs = mapM processDocument docs where processDocument doc = do - tags <- - S.fromList . autoApplySuggestedTags - <$> suggestTags settings allDocs doc + let tags = + S.fromList . autoApplySuggestedTags $ + suggestTags settings probabilityCache doc let doc' = doc - { index = + { D.index = doc.index - { tags = doc.index.tags `S.union` tags, - todo = False + { D.tags = doc.index.tags `S.union` tags, + D.todo = False } } - withGit do - J.encodeFile doc.iFilePath doc'.index - commitAll [doc.iFilePath] (printf "process %s (auto)" doc.iFilePath) + R.replaceDocument (printf "process %s (auto)" doc.iFilePath) doc' pure doc' -applyTags :: [Either T.Text G.Tag] -> Document -> Document +applyTags :: [Either T.Text G.Tag] -> D.Document -> D.Document applyTags tags' doc = do addTags tags (removeTags untags doc) where @@ -625,43 +572,43 @@ data AddTagException = AddTagException G.Tag instance Exception AddTagException -addTags :: [G.Tag] -> Document -> Document +addTags :: [G.Tag] -> D.Document -> D.Document addTags = flip (foldl (flip addTag)) -addTag :: G.Tag -> Document -> Document +addTag :: G.Tag -> D.Document -> D.Document addTag tag doc = if | G.tagKey tag == "todo" -> - doc {index = doc.index {todo = True}} + doc {D.index = doc.index {D.todo = True}} | G.tagKey tag == "addedAt" -> maybe (throw (AddTagException tag)) ( \addedAt -> - doc {index = doc.index {addedAt = addedAt}} + doc {D.index = doc.index {D.addedAt = addedAt}} ) (iso8601ParseM . T.unpack =<< (G.tagValue tag)) | G.tagKey tag == "modifiedAt" -> maybe (throw (AddTagException tag)) ( \modifiedAt -> - doc {index = doc.index {modifiedAt = Just modifiedAt}} + doc {D.index = doc.index {D.modifiedAt = Just modifiedAt}} ) (iso8601ParseM . T.unpack =<< (G.tagValue tag)) | G.tagKey tag == "content" -> maybe (throw (AddTagException tag)) ( \content -> - doc {index = doc.index {content = content}} + doc {D.index = doc.index {D.content = content}} ) (G.tagValue tag) | G.tagKey tag == "language" -> throw (AddTagException tag) | otherwise -> doc - { index = + { D.index = doc.index - { tags = G.replace tag doc.index.tags + { D.tags = G.replace tag doc.index.tags } } @@ -670,15 +617,15 @@ data RemoveTagException = RemoveTagException G.Tag instance Exception RemoveTagException -removeTags :: [G.Tag] -> Document -> Document +removeTags :: [G.Tag] -> D.Document -> D.Document removeTags = flip (foldl (flip removeTag)) -removeTag :: G.Tag -> Document -> Document +removeTag :: G.Tag -> D.Document -> D.Document removeTag tag doc = if | G.tagKey tag == "todo" -> - doc {index = doc.index {todo = False}} + doc {D.index = doc.index {D.todo = False}} | G.tagKey tag == "addedAt" -> throw (RemoveTagException tag) | G.tagKey tag == "modifiedAt" -> @@ -689,9 +636,9 @@ removeTag tag doc = throw (RemoveTagException tag) | otherwise -> doc - { index = + { D.index = doc.index - { tags = + { D.tags = maybe (G.deleteAll (G.tagKey tag)) (\_ -> G.delete tag) @@ -700,18 +647,16 @@ removeTag tag doc = } } -tagDocumentInteractively :: S.Settings -> [Document] -> Document -> IO Document -tagDocumentInteractively settings allDocs doc = do - suggestedTags <- suggestTags settings allDocs doc +tagDocumentInteractively :: S.Settings -> C.ProbabilityCache -> D.Document -> IO D.Document +tagDocumentInteractively settings probabilityCache doc = do + let suggestedTags = suggestTags settings probabilityCache doc tags <- mapM (uncurry tagDocumentInteractively') suggestedTags let doc' = (applyTags tags doc) - { index = doc.index {todo = False} + { D.index = doc.index {D.todo = False} } - withGit do - J.encodeFile doc.iFilePath doc'.index - commitAll [doc.iFilePath] (printf "process %s (interactive)" doc.iFilePath) - pure doc' + R.replaceDocument (printf "process %s (interactive)" doc.iFilePath) doc' + pure doc' where tagDocumentInteractively' :: G.Tag -> [G.Tag] -> IO (Either T.Text G.Tag) tagDocumentInteractively' tag tags @@ -770,30 +715,19 @@ consume1 language force keep filePath = do then ocr language filePath else pure content' addedAt <- getCurrentTime - withGit do - J.encodeFile - iFilePath - Index - { tags = S.empty, - todo = True, - modifiedAt = Nothing, - .. - } - if keep - then copyFile filePath oFilePath - else renameFile filePath oFilePath - commitAll - [iFilePath, oFilePath] - (printf "add %s" (takeFileName filePath)) - pure (takeBaseName iFilePath) - -withGit :: IO a -> IO a -withGit = withLockFile def ".gitlock" + let doc = + D.Document (fKey <.> "json") $ + D.Index + { tags = S.empty, + todo = True, + modifiedAt = Nothing, + .. + } + R.commitDocument (printf "add %s" (takeFileName filePath)) filePath doc + when (not keep) do + removeFile filePath -commitAll :: [FilePath] -> String -> IO () -commitAll fps m = do - sh_ (">/dev/null git add -- " ++ intercalate " " (map (printf "'%s'") fps)) - sh_ (printf ">/dev/null git commit -m '%s' || :" m) + pure (takeBaseName iFilePath) data DecodeException = DecodeException FilePath String deriving (Show) @@ -839,49 +773,6 @@ ocr1 language tmp input = -- XXX `--oem 1` seems to be unavailable <$> sh (printf "tesseract '%s' - -l '%s' --oem 3 --psm 1" (tmp input) language) -data Index = Index - { content :: T.Text, - tags :: S.Set G.Tag, - addedAt :: UTCTime, - modifiedAt :: Maybe UTCTime, - todo :: Bool, - language :: String - } - deriving (Show, Generic, Eq) - -instance J.ToJSON Index - -instance J.FromJSON Index - -instance HasField "shortText" Index T.Text where - getField = - (T.unlines . take 10 . T.lines) - . (.content) - -instance HasField "internalTags" Index (S.Set G.Tag) where - getField index = - index.tags - `S.union` internalTags index - --- TODO Cache `originalWords` --- --- @related cache-probabilitymap -instance HasField "originalWords" Index (S.Set T.Text) where - getField index = - S.fromList (T.words index.content) - -internalTags :: Index -> S.Set G.Tag -internalTags index = - S.fromList - ( concat - [ [ G.tag "addedAt" (Just (T.pack (iso8601Show index.addedAt))), - G.tag "language" (Just (T.pack index.language)) - ], - maybeToList (G.tag "modifiedAt" . Just . T.pack . iso8601Show <$> index.modifiedAt), - if index.todo then [G.tag "todo" Nothing] else [] - ] - ) - data PdfInfo = PdfInfo { numPages :: Int, pageSize :: (Double, Double) -- cgit v1.2.3