diff options
author | Alexander Foremny <aforemny@posteo.de> | 2024-01-11 03:20:56 +0100 |
---|---|---|
committer | Alexander Foremny <aforemny@posteo.de> | 2024-01-11 03:26:52 +0100 |
commit | 7bdf16be84b368655ce2ee3d9ab6bf185dfb59b5 (patch) | |
tree | 3ee48fc98f98ab7ac7ad19e24334e07b8b147dd6 | |
parent | 673c59d9be8b62106ffbba96d805680f0b5e7e3f (diff) |
chore: make computing `probabilityMap` more performance
-rw-r--r-- | apaperless.cabal | 14 | ||||
-rw-r--r-- | apaperless.yaml | 3 | ||||
-rw-r--r-- | app/Document.hs | 134 | ||||
-rw-r--r-- | app/Main.hs | 371 | ||||
-rw-r--r-- | app/ProbabilityMap.hs | 230 | ||||
-rw-r--r-- | app/Settings.hs | 25 | ||||
-rw-r--r-- | app/Store.hs | 60 | ||||
-rw-r--r-- | tags/src/Tag.hs | 4 | ||||
-rw-r--r-- | tags/tags.cabal | 3 |
9 files changed, 598 insertions, 246 deletions
diff --git a/apaperless.cabal b/apaperless.cabal index 480b605..1217b15 100644 --- a/apaperless.cabal +++ b/apaperless.cabal @@ -14,14 +14,17 @@ extra-doc-files: CHANGELOG.md -- extra-source-files: common warnings - ghc-options: -Wall -threaded + ghc-options: -Wall -threaded -with-rtsopts=-N -rtsopts executable apaperless import: warnings main-is: Main.hs other-modules: + Document + ProbabilityMap Prompt Settings + Store -- other-extensions: build-depends: base, @@ -43,6 +46,13 @@ executable apaperless time, regex, yaml, - xdg-basedir + xdg-basedir, + binary, + vector, + parallel, + deepseq, + unordered-containers, + vector-binary-instances, + split hs-source-dirs: app default-language: GHC2021 diff --git a/apaperless.yaml b/apaperless.yaml index ed74742..bbc9bec 100644 --- a/apaperless.yaml +++ b/apaperless.yaml @@ -2,7 +2,8 @@ defaultLanguage: deu+eng suggestedTags: - contents: - createdAt - - '[0-9]{2}\.[0-9]{2}\.[0-9]{4}' + - - ${d}([0-9]{2})\.${m}([0-9]{2})\.${y}([0-9]{4}) + - ${y}-${m}-${d} tag: SuggestTagByRE - contents: correspondent tag: SuggestTagByTags diff --git a/app/Document.hs b/app/Document.hs new file mode 100644 index 0000000..ac8a73b --- /dev/null +++ b/app/Document.hs @@ -0,0 +1,134 @@ +{-# LANGUAGE DataKinds #-} +{-# LANGUAGE OverloadedRecordDot #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE StrictData #-} + +module Document + ( Document (..), + Index (..), + hasWord, + hasTag, + hasWordAndTag, + tagValues, + getDocuments, + readDocument, + ) +where + +import Control.Concurrent.ParallelIO.Local (parallel, withPool) +import Control.Exception (Exception, throwIO) +import Data.Aeson qualified as J +import Data.ByteString.Lazy qualified as LB +import Data.List (sort) +import Data.Map qualified as M +import Data.Maybe (mapMaybe, maybeToList) +import Data.Set qualified as S +import Data.Text qualified as T +import Data.Time.Clock (UTCTime) +import Data.Time.Format.ISO8601 (iso8601Show) +import GHC.Conc (getNumProcessors) +import GHC.Generics (Generic) +import GHC.Records (HasField (..)) +import System.Directory (listDirectory) +import System.FilePath (takeBaseName, (<.>), (</>)) +import Tag qualified as G + +-- TODO Inline `Index` + +-- TODO Replace `iFileName` with `id` +data Document = Document + { iFileName :: String, + index :: Index + } + deriving (Show) + +instance HasField "oFilePath" Document FilePath where + getField doc = "originals" </> takeBaseName doc.iFileName <.> "pdf" + +instance HasField "iFilePath" Document FilePath where + getField doc = "index" </> doc.iFileName + +hasWord :: T.Text -> Document -> Bool +hasWord word doc = S.member word doc.index.originalWords + +hasTag :: G.Tag -> Document -> Bool +hasTag tag doc = S.member tag doc.index.tags + +hasWordAndTag :: T.Text -> G.Tag -> Document -> Bool +hasWordAndTag word tag doc = hasTag tag doc && hasWord word doc + +tagValues :: [Document] -> M.Map T.Text (S.Set T.Text) +tagValues docs = + M.unionsWith S.union $ + mapMaybe + ( \tag -> + M.singleton (G.tagKey tag) . S.singleton <$> (G.tagValue tag) + ) + (S.toList (S.unions (map (.index.tags) docs))) + +getDocuments :: IO [Document] +getDocuments = + parMapM readDocument + =<< sort <$> listDirectory "index" + +readDocument :: FilePath -> IO Document +readDocument iFileName = + Document iFileName + <$> decodeFile @Index ("index" </> iFileName) + +data Index = Index + { content :: T.Text, + tags :: S.Set G.Tag, + addedAt :: UTCTime, + modifiedAt :: Maybe UTCTime, + todo :: Bool, + language :: String + } + deriving (Show, Generic, Eq) + +instance J.ToJSON Index + +instance J.FromJSON Index + +instance HasField "shortText" Index T.Text where + getField = + (T.unlines . take 10 . T.lines) + . (.content) + +instance HasField "internalTags" Index (S.Set G.Tag) where + getField index = + index.tags `S.union` internalTags index + +-- TODO Cache `originalWords` +-- +-- @related cache-probabilitymap +instance HasField "originalWords" Index (S.Set T.Text) where + getField index = + S.fromList (T.words index.content) + +internalTags :: Index -> S.Set G.Tag +internalTags index = + S.fromList + ( concat + [ [ G.tag "addedAt" (Just (T.pack (iso8601Show index.addedAt))), + G.tag "language" (Just (T.pack index.language)) + ], + maybeToList (G.tag "modifiedAt" . Just . T.pack . iso8601Show <$> index.modifiedAt), + if index.todo then [G.tag "todo" Nothing] else [] + ] + ) + +parMapM :: (a -> IO b) -> [a] -> IO [b] +parMapM f xs = do + n <- getNumProcessors + withPool n $ \pool -> parallel pool (map f xs) + +data DecodeException = DecodeException FilePath String + deriving (Show) + +instance Exception DecodeException + +decodeFile :: J.FromJSON a => FilePath -> IO a +decodeFile fp = + either (throwIO . DecodeException fp) pure . J.eitherDecode + =<< LB.readFile fp diff --git a/app/Main.hs b/app/Main.hs index 93fed75..936f053 100644 --- a/app/Main.hs +++ b/app/Main.hs @@ -1,5 +1,6 @@ {-# LANGUAGE BlockArguments #-} {-# LANGUAGE DataKinds #-} +{-# LANGUAGE DeriveAnyClass #-} {-# LANGUAGE DuplicateRecordFields #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE MultiWayIf #-} @@ -16,41 +17,42 @@ module Main where import Control.Arrow (first, second) import Control.Concurrent.ParallelIO.Local (parallel, withPool) import Control.Exception (Exception, throw, throwIO) -import Control.Monad (forM, unless, when) +import Control.Monad (unless, when) import Data.Aeson qualified as J import Data.Attoparsec.Text qualified as A import Data.ByteString.Lazy qualified as LB -import Data.Default import Data.Digest.Pure.SHA (sha1, showDigest) import Data.Either (partitionEithers) import Data.Function ((&)) +import Data.HashMap.Internal qualified as HM import Data.List import Data.List.NonEmpty qualified as N import Data.Map qualified as M -import Data.Maybe (catMaybes, fromMaybe, mapMaybe, maybeToList) +import Data.Maybe (fromMaybe, mapMaybe) import Data.Ord (comparing) import Data.Set qualified as S import Data.String (IsString (fromString)) import Data.Text qualified as T import Data.Text.Encoding qualified as T import Data.Text.IO qualified as T -import Data.Time.Clock (UTCTime, getCurrentTime) -import Data.Time.Format.ISO8601 (iso8601ParseM, iso8601Show) +import Data.Time.Clock (getCurrentTime) +import Data.Time.Format.ISO8601 (iso8601ParseM) import Debug.Trace +import Document qualified as D import GHC.Conc (getNumProcessors) -import GHC.Generics (Generic) -import GHC.Records (HasField (..)) import Options.Applicative qualified as O +import ProbabilityMap qualified as C import Prompt qualified as P import Settings qualified as S +import Store qualified as R import System.Directory import System.Environment (getEnv) import System.FilePath -import System.IO.LockFile (withLockFile) import System.IO.Temp (withSystemTempDirectory) import System.Process.Typed import Tag qualified as G import Text.Printf (printf) +import Text.RE.Replace qualified as R import Text.RE.TDFA.Text qualified as R import Text.Read (readMaybe) @@ -291,13 +293,13 @@ main = do keep ) (map (cwd </>) inputs) - docs <- mapM (readDocument . (<.> "json")) indexNames - allDocs <- getDocuments + docs <- mapM (D.readDocument . (<.> "json")) indexNames + probabilityCache <- C.readProbabilityCache docs' <- if - | auto -> processDocuments settings allDocs docs + | auto -> processDocuments settings probabilityCache docs -- TODO adding tags interactively through prompt does not persist them in store - | prompt -> processDocumentsInteractively settings allDocs docs + | prompt -> processDocumentsInteractively settings probabilityCache docs | otherwise -> pure docs mapM_ ( \doc -> do @@ -307,7 +309,7 @@ main = do docs' Args {cmd = Edit {indexNames}} -> do editDocuments - =<< mapM (readDocument . (<.> "json")) indexNames + =<< mapM (D.readDocument . (<.> "json")) indexNames Args {cmd = List {filters, redo, todo = False, view = False, edit = False}} -> do doRedoIf filters redo mapM_ @@ -316,24 +318,26 @@ main = do printTags doc ) . filter (G.applyFilters filters . (.index.tags)) - =<< getDocuments + =<< D.getDocuments Args {cmd = List {filters, redo, edit = True}} -> do doRedoIf filters redo editDocuments . filter (G.applyFilters filters . (.index.tags)) - =<< getDocuments + =<< D.getDocuments Args {cmd = List {filters, redo, todo = True}} -> do doRedoIf filters redo - allDocs <- getDocuments + allDocs <- D.getDocuments + probabilityCache <- C.readProbabilityCache _ <- - processDocumentsInteractively settings allDocs + processDocumentsInteractively settings probabilityCache . filter (G.applyFilters filters . (.index.tags)) $ allDocs pure () Args {cmd = Todo} -> do - allDocs <- getDocuments + allDocs <- D.getDocuments + probabilityCache <- C.readProbabilityCache _ <- - processDocumentsInteractively settings allDocs + processDocumentsInteractively settings probabilityCache . filter (G.applyFilters [G.filter G.include "todo" Nothing] . (.index.tags)) $ allDocs pure () @@ -341,12 +345,12 @@ main = do doRedoIf filters redo viewDocuments . filter (G.applyFilters filters . (.index.tags)) - =<< getDocuments + =<< D.getDocuments Args {cmd = View {indexNames}} -> do viewDocuments - =<< mapM (readDocument . (<.> "json")) indexNames + =<< mapM (D.readDocument . (<.> "json")) indexNames Args {cmd = TopWords} -> do - allDocs <- getDocuments + probabilityCache <- C.readProbabilityCache mapM_ ( \(tag, xs) -> do print tag @@ -355,24 +359,23 @@ main = do $ map (second (sortBy (comparing (negate . snd)))) $ map (second (filter ((> 0) . snd))) $ M.toList - $ foldl - ( \wordProbabilityPerTag (word, tag, p) -> + $ M.foldlWithKey + ( \wordProbabilityPerTag (word, tag) p -> M.insertWith (++) tag [(word, p)] wordProbabilityPerTag ) M.empty - (probabilityMap allDocs) + probabilityCache.probabilityMap Args {cmd = Modify {indexNames, tags, untags}} -> do - docs <- mapM (readDocument . (<.> "json")) indexNames + docs <- mapM (D.readDocument . (<.> "json")) indexNames mapM_ - ( \doc -> do - let doc' = addTags tags (removeTags untags doc) - withGit do - J.encodeFile doc'.iFilePath doc'.index - commitAll [doc.iFilePath] (printf "tag %s" (takeBaseName doc.iFilePath)) + ( \doc -> + R.replaceDocument + (printf "tag %s" (takeBaseName doc.iFilePath)) + doc ) - docs + (map (addTags tags . removeTags untags) docs) -printTags :: Document -> IO () +printTags :: D.Document -> IO () printTags doc = mapM_ ( \tag -> @@ -380,76 +383,36 @@ printTags doc = Nothing -> printf "@%s\n" (G.tagKey tag) Just tagValue -> printf "@%s %s\n" (G.tagKey tag) tagValue ) - ( filter - ( \tag -> - -- TODO Handle tags hidden by default - -- - -- - Add `list-default-hide-tags` to `Settings` - -- - Add `--show-tags`, `--hide-tags` to `List` - not ("paperless-ngx." `T.isPrefixOf` G.tagKey tag) - && (G.tagKey tag /= "content") - && (G.tagKey tag /= "importedFrom") - && (G.tagKey tag /= "language") - ) + ( filter arbitrarilySelectTag . S.toList $ doc.index.tags `S.union` doc.index.internalTags ) +arbitrarilySelectTag :: G.Tag -> Bool +arbitrarilySelectTag tag = + -- TODO Handle tags hidden by default + -- + -- - Add `list-default-hide-tags` to `Settings` + -- - Add `--show-tags`, `--hide-tags` to `List` + not ("paperless-ngx." `T.isPrefixOf` G.tagKey tag) + && (G.tagKey tag /= "content") + && (G.tagKey tag /= "importedFrom") + && (G.tagKey tag /= "language") + doRedoIf :: [G.Filter] -> Bool -> IO () doRedoIf filters redo = when redo do parMapM_ doRedo . filter (G.applyFilters filters . (.index.tags)) - =<< getDocuments + =<< D.getDocuments where doRedo doc = do content <- ocr doc.index.language doc.oFilePath - withGit do - J.encodeFile doc.iFilePath doc.index {content = content} - commitAll [doc.iFilePath] (printf "redo %s" (takeBaseName doc.iFilePath)) - -data Document = Document - { iFileName :: String, - index :: Index - } - deriving (Show) - -instance HasField "oFilePath" Document FilePath where - getField doc = "originals" </> takeBaseName doc.iFileName <.> "pdf" - -instance HasField "iFilePath" Document FilePath where - getField doc = "index" </> doc.iFileName - -hasWord :: T.Text -> Document -> Bool -hasWord word doc = S.member word doc.index.originalWords - -hasTag :: G.Tag -> Document -> Bool -hasTag tag doc = S.member tag doc.index.tags - -hasWordAndTag :: T.Text -> G.Tag -> Document -> Bool -hasWordAndTag word tag doc = hasTag tag doc && hasWord word doc - -tagValues :: [Document] -> M.Map T.Text (S.Set T.Text) -tagValues docs = - M.unionsWith S.union $ - mapMaybe - ( \tag -> - M.singleton (G.tagKey tag) . S.singleton <$> (G.tagValue tag) - ) - (S.toList (S.unions (map (.index.tags) docs))) + let doc' = doc {D.index = doc.index {D.content = content}} + R.replaceDocument (printf "redo %s" (takeBaseName doc.iFilePath)) doc' -getDocuments :: IO [Document] -getDocuments = - parMapM readDocument - =<< sort <$> listDirectory "index" - -readDocument :: FilePath -> IO Document -readDocument iFileName = - Document iFileName - <$> decodeFile @Index ("index" </> iFileName) - -processDocumentsInteractively :: S.Settings -> [Document] -> [Document] -> IO [Document] -processDocumentsInteractively settings allDocs docs = +processDocumentsInteractively :: S.Settings -> C.ProbabilityCache -> [D.Document] -> IO [D.Document] +processDocumentsInteractively settings probabilityCache docs = do mapM (uncurry processDocumentInteractively) (zip [1 :: Int ..] docs) where numDocs = length docs @@ -478,13 +441,13 @@ processDocumentsInteractively settings allDocs docs = printf "%s\n" (takeBaseName doc.iFileName) printf "%s\n" doc.index.content processDocumentInteractively n doc - "p" -> tagDocumentInteractively settings allDocs doc + "p" -> tagDocumentInteractively settings probabilityCache doc "s" -> pure doc "v" -> do viewDocuments [doc] processDocumentInteractively n doc -viewDocuments :: [Document] -> IO () +viewDocuments :: [D.Document] -> IO () viewDocuments docs = sh_ ( "zathura " @@ -494,7 +457,7 @@ viewDocuments docs = ) ) -editDocuments :: [Document] -> IO () +editDocuments :: [D.Document] -> IO () editDocuments docs = withSystemTempDirectory "apaperless" $ \tmp -> do let fp doc = tmp </> takeBaseName doc.iFileName <.> "txt" @@ -513,63 +476,49 @@ editDocuments docs = mapM_ ( \doc -> do content <- T.readFile (fp doc) - withGit do - J.encodeFile doc.iFilePath doc.index {content = content} - commitAll [doc.iFilePath] (printf "edit %s" (takeBaseName doc.iFilePath)) + let doc' = doc {D.index = doc.index {D.content = content}} + R.replaceDocument (printf "edit %s" (takeBaseName doc.iFilePath)) doc' pure content ) docs -type ProbabilityMap = [(T.Text, G.Tag, Double)] - --- TODO Consider words that contribute to NOT adding a tag --- --- If there is a document that should, say, not have a `@correspondent`, we should score words that contribute to that fact as well. --- --- @topic probability-map -probabilityMap :: [Document] -> ProbabilityMap -probabilityMap allDocs = - [ let docsWithWord = filter (hasWord word) allDocs - docsWithTag = filter (hasTag tag) allDocs - docsWithWordAndTag = filter (hasWordAndTag word tag) allDocs - p = - fi (length docsWithWordAndTag) - / fi - ( length docsWithWord - + length docsWithTag - - length docsWithWordAndTag - ) - in (word, tag, p) - | word <- S.toList allWords, - tag <- S.toList allTags - ] - where - allTags = foldl S.union S.empty (map (.index.tags) allDocs) - allWords = foldl S.union S.empty (map (.index.originalWords) allDocs) - fi = fromIntegral @Int @Double - -suggestTags :: S.Settings -> [Document] -> Document -> IO [(G.Tag, [G.Tag])] -suggestTags settings allDocs doc = do - forM settings.suggestedTags $ \suggestedTag -> do +suggestTags :: S.Settings -> C.ProbabilityCache -> D.Document -> [(G.Tag, [G.Tag])] +suggestTags settings probabilityCache doc = do + flip map settings.suggestedTags $ \suggestedTag -> case suggestedTag of - S.SuggestTagByRE tagName pattern -> do + S.SuggestTagByRE tagName pattern -> let tagValues = - nub . catMaybes . map R.matchedText . R.allMatches $ - doc.index.content - R.*=~ pattern - pure (G.tag tagName (Just ""), map (G.tag tagName . Just) tagValues) - S.SuggestTagByTags tagName -> do + nub + . mapMaybe + ( -- XXX Whys is this so complicated? + \match -> do + captures <- snd <$> R.matchCaptures match + pure + ( foldl + ( \template (captureName, captureOrdinal) -> + T.replace + ("${" <> R.getCaptureName captureName <> "}") + (R.capturedText (captures !! (fromEnum captureOrdinal - 1))) + template + ) + (R.getTemplate pattern) + (HM.toList $ R.captureNames match) + ) + ) + $ R.allMatches (doc.index.content R.*=~ (R.getSearch pattern)) + in (G.tag tagName (Just ""), map (G.tag tagName . Just) tagValues) + S.SuggestTagByTags tagName -> let tagValues = - -- TODO Cache `probabilityMap` + -- TODO Cache `probabilityCache` -- -- @topic probability-map - probabilityMap allDocs - & filter - ( \(word, tag, _) -> - G.tagKey tag == tagName && hasWord word doc + probabilityCache.probabilityMap + & M.filterWithKey + ( \(word, tag) _ -> + G.tagKey tag == tagName && D.hasWord word doc ) - & foldl' - ( \scorePerTagValue (_, tag, p) -> + & M.foldlWithKey' + ( \scorePerTagValue (_, tag) p -> M.insertWith (+) (G.tagValue tag) @@ -580,7 +529,7 @@ suggestTags settings allDocs doc = do & M.toList & sortBy (comparing (negate . snd)) & map fst - pure (G.tag tagName (Just ""), map (G.tag tagName) tagValues) + in (G.tag tagName (Just ""), map (G.tag tagName) tagValues) autoApplySuggestedTags :: [(G.Tag, [G.Tag])] -> [G.Tag] autoApplySuggestedTags = @@ -591,28 +540,26 @@ autoApplySuggestedTags = else Just (head suggestedTags) ) -processDocuments :: S.Settings -> [Document] -> [Document] -> IO [Document] -processDocuments settings allDocs docs = +processDocuments :: S.Settings -> C.ProbabilityCache -> [D.Document] -> IO [D.Document] +processDocuments settings probabilityCache docs = mapM processDocument docs where processDocument doc = do - tags <- - S.fromList . autoApplySuggestedTags - <$> suggestTags settings allDocs doc + let tags = + S.fromList . autoApplySuggestedTags $ + suggestTags settings probabilityCache doc let doc' = doc - { index = + { D.index = doc.index - { tags = doc.index.tags `S.union` tags, - todo = False + { D.tags = doc.index.tags `S.union` tags, + D.todo = False } } - withGit do - J.encodeFile doc.iFilePath doc'.index - commitAll [doc.iFilePath] (printf "process %s (auto)" doc.iFilePath) + R.replaceDocument (printf "process %s (auto)" doc.iFilePath) doc' pure doc' -applyTags :: [Either T.Text G.Tag] -> Document -> Document +applyTags :: [Either T.Text G.Tag] -> D.Document -> D.Document applyTags tags' doc = do addTags tags (removeTags untags doc) where @@ -625,43 +572,43 @@ data AddTagException = AddTagException G.Tag instance Exception AddTagException -addTags :: [G.Tag] -> Document -> Document +addTags :: [G.Tag] -> D.Document -> D.Document addTags = flip (foldl (flip addTag)) -addTag :: G.Tag -> Document -> Document +addTag :: G.Tag -> D.Document -> D.Document addTag tag doc = if | G.tagKey tag == "todo" -> - doc {index = doc.index {todo = True}} + doc {D.index = doc.index {D.todo = True}} | G.tagKey tag == "addedAt" -> maybe (throw (AddTagException tag)) ( \addedAt -> - doc {index = doc.index {addedAt = addedAt}} + doc {D.index = doc.index {D.addedAt = addedAt}} ) (iso8601ParseM . T.unpack =<< (G.tagValue tag)) | G.tagKey tag == "modifiedAt" -> maybe (throw (AddTagException tag)) ( \modifiedAt -> - doc {index = doc.index {modifiedAt = Just modifiedAt}} + doc {D.index = doc.index {D.modifiedAt = Just modifiedAt}} ) (iso8601ParseM . T.unpack =<< (G.tagValue tag)) | G.tagKey tag == "content" -> maybe (throw (AddTagException tag)) ( \content -> - doc {index = doc.index {content = content}} + doc {D.index = doc.index {D.content = content}} ) (G.tagValue tag) | G.tagKey tag == "language" -> throw (AddTagException tag) | otherwise -> doc - { index = + { D.index = doc.index - { tags = G.replace tag doc.index.tags + { D.tags = G.replace tag doc.index.tags } } @@ -670,15 +617,15 @@ data RemoveTagException = RemoveTagException G.Tag instance Exception RemoveTagException -removeTags :: [G.Tag] -> Document -> Document +removeTags :: [G.Tag] -> D.Document -> D.Document removeTags = flip (foldl (flip removeTag)) -removeTag :: G.Tag -> Document -> Document +removeTag :: G.Tag -> D.Document -> D.Document removeTag tag doc = if | G.tagKey tag == "todo" -> - doc {index = doc.index {todo = False}} + doc {D.index = doc.index {D.todo = False}} | G.tagKey tag == "addedAt" -> throw (RemoveTagException tag) | G.tagKey tag == "modifiedAt" -> @@ -689,9 +636,9 @@ removeTag tag doc = throw (RemoveTagException tag) | otherwise -> doc - { index = + { D.index = doc.index - { tags = + { D.tags = maybe (G.deleteAll (G.tagKey tag)) (\_ -> G.delete tag) @@ -700,18 +647,16 @@ removeTag tag doc = } } -tagDocumentInteractively :: S.Settings -> [Document] -> Document -> IO Document -tagDocumentInteractively settings allDocs doc = do - suggestedTags <- suggestTags settings allDocs doc +tagDocumentInteractively :: S.Settings -> C.ProbabilityCache -> D.Document -> IO D.Document +tagDocumentInteractively settings probabilityCache doc = do + let suggestedTags = suggestTags settings probabilityCache doc tags <- mapM (uncurry tagDocumentInteractively') suggestedTags let doc' = (applyTags tags doc) - { index = doc.index {todo = False} + { D.index = doc.index {D.todo = False} } - withGit do - J.encodeFile doc.iFilePath doc'.index - commitAll [doc.iFilePath] (printf "process %s (interactive)" doc.iFilePath) - pure doc' + R.replaceDocument (printf "process %s (interactive)" doc.iFilePath) doc' + pure doc' where tagDocumentInteractively' :: G.Tag -> [G.Tag] -> IO (Either T.Text G.Tag) tagDocumentInteractively' tag tags @@ -770,30 +715,19 @@ consume1 language force keep filePath = do then ocr language filePath else pure content' addedAt <- getCurrentTime - withGit do - J.encodeFile - iFilePath - Index - { tags = S.empty, - todo = True, - modifiedAt = Nothing, - .. - } - if keep - then copyFile filePath oFilePath - else renameFile filePath oFilePath - commitAll - [iFilePath, oFilePath] - (printf "add %s" (takeFileName filePath)) - pure (takeBaseName iFilePath) - -withGit :: IO a -> IO a -withGit = withLockFile def ".gitlock" + let doc = + D.Document (fKey <.> "json") $ + D.Index + { tags = S.empty, + todo = True, + modifiedAt = Nothing, + .. + } + R.commitDocument (printf "add %s" (takeFileName filePath)) filePath doc + when (not keep) do + removeFile filePath -commitAll :: [FilePath] -> String -> IO () -commitAll fps m = do - sh_ (">/dev/null git add -- " ++ intercalate " " (map (printf "'%s'") fps)) - sh_ (printf ">/dev/null git commit -m '%s' || :" m) + pure (takeBaseName iFilePath) data DecodeException = DecodeException FilePath String deriving (Show) @@ -839,49 +773,6 @@ ocr1 language tmp input = -- XXX `--oem 1` seems to be unavailable <$> sh (printf "tesseract '%s' - -l '%s' --oem 3 --psm 1" (tmp </> input) language) -data Index = Index - { content :: T.Text, - tags :: S.Set G.Tag, - addedAt :: UTCTime, - modifiedAt :: Maybe UTCTime, - todo :: Bool, - language :: String - } - deriving (Show, Generic, Eq) - -instance J.ToJSON Index - -instance J.FromJSON Index - -instance HasField "shortText" Index T.Text where - getField = - (T.unlines . take 10 . T.lines) - . (.content) - -instance HasField "internalTags" Index (S.Set G.Tag) where - getField index = - index.tags - `S.union` internalTags index - --- TODO Cache `originalWords` --- --- @related cache-probabilitymap -instance HasField "originalWords" Index (S.Set T.Text) where - getField index = - S.fromList (T.words index.content) - -internalTags :: Index -> S.Set G.Tag -internalTags index = - S.fromList - ( concat - [ [ G.tag "addedAt" (Just (T.pack (iso8601Show index.addedAt))), - G.tag "language" (Just (T.pack index.language)) - ], - maybeToList (G.tag "modifiedAt" . Just . T.pack . iso8601Show <$> index.modifiedAt), - if index.todo then [G.tag "todo" Nothing] else [] - ] - ) - data PdfInfo = PdfInfo { numPages :: Int, pageSize :: (Double, Double) diff --git a/app/ProbabilityMap.hs b/app/ProbabilityMap.hs new file mode 100644 index 0000000..9a700d7 --- /dev/null +++ b/app/ProbabilityMap.hs @@ -0,0 +1,230 @@ +{-# LANGUAGE DeriveAnyClass #-} +{-# LANGUAGE DuplicateRecordFields #-} +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE OverloadedRecordDot #-} +{-# LANGUAGE RecordWildCards #-} +{-# LANGUAGE StrictData #-} +{-# LANGUAGE ViewPatterns #-} +{-# LANGUAGE NoFieldSelectors #-} +{-# OPTIONS_GHC -fno-warn-name-shadowing #-} + +module ProbabilityMap + ( fileName, + ProbabilityCache (..), + readProbabilityCache, + writeProbabilityCache, + modifyProbabilityCache, + fromDocuments, + addDocument, + deleteDocument, + replaceDocument, + ) +where + +import Control.Applicative (liftA2) +import Control.Arrow ((***)) +import Control.DeepSeq (NFData (..)) +import Control.Exception (SomeException, try) +import Control.Parallel.Strategies +import Data.Binary qualified as B +import Data.List (foldl') +import Data.Map qualified as M +import Data.Maybe (fromMaybe) +import Data.Set qualified as S +import Data.Text qualified as T +import Data.Vector qualified as V +import Data.Vector.Binary () +import Document qualified as D +import GHC.Conc (numCapabilities) +import GHC.Generics (Generic) +import Tag qualified as G + +data ProbabilityCache = ProbabilityCache + { docsPerWord :: M.Map T.Text Int, + docsPerTag :: M.Map G.Tag Int, + docsPerWordAndTag :: M.Map (T.Text, G.Tag) Int, + probabilityMap :: M.Map (T.Text, G.Tag) Double + } + deriving (Show, Generic) + +data ProbabilityCache' = ProbabilityCache' + { docsPerWord :: M.Map T.Text Int, + docsPerTag :: M.Map G.Tag Int, + docsPerWordAndTag :: M.Map (T.Text, G.Tag) Int + } + deriving (Show, Generic) + +instance NFData ProbabilityCache' where + rnf (ProbabilityCache' {..}) = + rnf docsPerWord `seq` + rnf docsPerTag `seq` + rnf docsPerWordAndTag + +fromProbabilityCache' :: ProbabilityCache' -> ProbabilityCache +fromProbabilityCache' probabilityCache'@(ProbabilityCache' {..}) = + ProbabilityCache {probabilityMap = probabilityMap probabilityCache', ..} + +toProbabilityCache' :: ProbabilityCache -> ProbabilityCache' +toProbabilityCache' (ProbabilityCache {..}) = + ProbabilityCache' {..} + +instance B.Binary ProbabilityCache where + get = fromCacheRep <$> B.get + put = B.put . toCacheRep + +instance NFData ProbabilityCache where + rnf (ProbabilityCache {..}) = + rnf docsPerWord `seq` + rnf docsPerTag `seq` + rnf docsPerWordAndTag `seq` + rnf probabilityMap + +data CacheRep = CacheRep + { allWords :: V.Vector T.Text, + allTags :: V.Vector G.Tag, + docsPerWord :: M.Map Int Int, + docsPerTag :: M.Map Int Int, + docsPerWordAndTag :: M.Map (Int, Int) Int, + probabilityMap :: M.Map (Int, Int) Double + } + deriving (Show, Generic, B.Binary) + +instance NFData CacheRep where + rnf (CacheRep {..}) = + rnf allWords `seq` + rnf allTags `seq` + rnf docsPerWord `seq` + rnf docsPerTag `seq` + rnf docsPerWordAndTag `seq` + rnf probabilityMap + +fromCacheRep :: CacheRep -> ProbabilityCache +fromCacheRep (CacheRep {..}) = + ProbabilityCache + { docsPerWord = M.mapKeys fromWord docsPerWord, + docsPerTag = M.mapKeys fromTag docsPerTag, + docsPerWordAndTag = M.mapKeys fromWordAndTag docsPerWordAndTag, + probabilityMap = M.mapKeys fromWordAndTag probabilityMap + } + where + fromWord = (allWords V.!) + fromTag = (allTags V.!) + fromWordAndTag = fromWord *** fromTag + +toCacheRep :: ProbabilityCache -> CacheRep +toCacheRep (ProbabilityCache {..}) = + CacheRep + { allWords = V.fromList allWords', + allTags = V.fromList allTags', + docsPerWord = M.mapKeys toWord docsPerWord, + docsPerTag = M.mapKeys toTag docsPerTag, + docsPerWordAndTag = M.mapKeys toWordAndTag docsPerWordAndTag, + probabilityMap = M.mapKeys toWordAndTag probabilityMap + } + where + allWords' = M.keys docsPerWord + allTags' = M.keys docsPerTag + allWordsMap = M.fromList $ zip allWords' [0 ..] + allTagsMap = M.fromList $ zip allTags' [0 ..] + toWord = (allWordsMap M.!) + toTag = (allTagsMap M.!) + toWordAndTag = toWord *** toTag + +fileName :: FilePath +fileName = "probabilityCache" + +readProbabilityCache :: IO ProbabilityCache +readProbabilityCache = + try (B.decodeFile fileName) >>= \case + Left (_ :: SomeException) -> fromDocuments <$> D.getDocuments + Right probabilityCache -> pure probabilityCache + +writeProbabilityCache :: ProbabilityCache -> IO () +writeProbabilityCache probabilityCache = + B.encodeFile fileName probabilityCache + +modifyProbabilityCache :: (ProbabilityCache -> ProbabilityCache) -> IO ProbabilityCache +modifyProbabilityCache f = + liftA2 (>>) (writeProbabilityCache . f) pure =<< readProbabilityCache + +fromDocuments :: [D.Document] -> ProbabilityCache +fromDocuments allDocs = + fromProbabilityCache' $ + foldl' (flip addDocument') (ProbabilityCache' M.empty M.empty M.empty) allDocs + +probabilityMap :: ProbabilityCache' -> M.Map (T.Text, G.Tag) Double +probabilityMap (ProbabilityCache' {..}) = + M.unions + . withStrategy (parList rdeepseq) + . map + ( M.fromList + . filter (\(_, p) -> p > 0) + . map (\k@(word, tag) -> (k, probability word tag)) + ) + $ streamsOf + numCapabilities + [ (word, tag) + | word <- allWords, + tag <- allTags + ] + where + probability word tag = + let docsWithWord = docsPerWord M.! word + docsWithTag = docsPerTag M.! tag + docsWithWordAndTag = fromMaybe 0 $ M.lookup (word, tag) docsPerWordAndTag + in fi docsWithWordAndTag / fi (docsWithWord + docsWithTag - docsWithWordAndTag) + + allWords = M.keys docsPerWord + allTags = M.keys docsPerTag + fi = fromIntegral @Int @Double + +streamsOf :: Int -> [a] -> [[a]] +streamsOf 1 xs = [xs] +streamsOf n xs + | n > 0 = [everyN k xs | k <- [0 .. n - 1]] + | otherwise = [] + where + everyN k xs = map snd $ filter ((== k) . (`mod` n) . fst) $ zip [0 ..] xs + +addDocument :: D.Document -> ProbabilityCache -> ProbabilityCache +addDocument doc probabilityCache = + fromProbabilityCache' $ addDocument' doc (toProbabilityCache' probabilityCache) + +deleteDocument :: D.Document -> ProbabilityCache -> ProbabilityCache +deleteDocument doc probabilityCache = + fromProbabilityCache' $ deleteDocument' doc (toProbabilityCache' probabilityCache) + +replaceDocument :: D.Document -> D.Document -> ProbabilityCache -> ProbabilityCache +replaceDocument oldDoc newDoc probabilityCache = + fromProbabilityCache' $ + (addDocument' newDoc . deleteDocument' oldDoc) (toProbabilityCache' probabilityCache) + +addDocument' :: D.Document -> ProbabilityCache' -> ProbabilityCache' +addDocument' doc probabilityCache = + combining (+) probabilityCache (fromDocument doc) + +deleteDocument' :: D.Document -> ProbabilityCache' -> ProbabilityCache' +deleteDocument' doc probabilityCache = + combining (-) probabilityCache (fromDocument doc) + +combining :: (Int -> Int -> Int) -> ProbabilityCache' -> ProbabilityCache' -> ProbabilityCache' +combining f probabilityCache probabilityCache' = + ProbabilityCache' {..} + where + docsPerWord = M.unionWith f probabilityCache.docsPerWord probabilityCache'.docsPerWord + docsPerTag = M.unionWith f probabilityCache.docsPerTag probabilityCache'.docsPerTag + docsPerWordAndTag = M.unionWith f probabilityCache.docsPerWordAndTag probabilityCache'.docsPerWordAndTag + +fromDocument :: D.Document -> ProbabilityCache' +fromDocument doc = + let words = S.toList doc.index.originalWords + tags = S.toList doc.index.tags + wordAndTags = [(word, tag) | word <- words, tag <- tags] + in ProbabilityCache' + { docsPerWord = docsPer words, + docsPerTag = docsPer tags, + docsPerWordAndTag = docsPer wordAndTags + } + where + docsPer :: Ord k => [k] -> M.Map k Int + docsPer = M.fromList . map (flip (,) 1) diff --git a/app/Settings.hs b/app/Settings.hs index 08b40bb..43f721c 100644 --- a/app/Settings.hs +++ b/app/Settings.hs @@ -16,6 +16,7 @@ import Data.Aeson qualified as A import Data.Function (on) import Data.List (nub) import Data.Text qualified as T +import Data.Vector qualified as V import Data.Yaml (decodeFileThrow, encodeFile) import GHC.Generics (Generic) import System.Directory (doesFileExist) @@ -47,7 +48,9 @@ instance Monoid Settings where -- - perform minor corrections on OCR, ie. parse `0 1.01.1970` -- -- @topic suggested-tags - SuggestTagByRE "createdAt" [R.re|[0-9]{2}\.[0-9]{2}\.[0-9]{4}|], + SuggestTagByRE + "createdAt" + [R.ed|${d}([0-9]{2})\.${m}([0-9]{2})\.${y}([0-9]{4})///${y}-${m}-${d}|], SuggestTagByTags "correspondent" ], defaultLanguage = "deu+eng" @@ -58,7 +61,7 @@ instance A.FromJSON Settings instance A.ToJSON Settings data SuggestedTag - = SuggestTagByRE T.Text R.RE + = SuggestTagByRE T.Text (R.SearchReplace R.RE T.Text) | SuggestTagByTags T.Text deriving (Show, Generic, Eq) @@ -68,6 +71,10 @@ instance Show R.RE where instance Eq R.RE where (==) = (==) `on` show +instance (Eq re, Eq s) => Eq (R.SearchReplace re s) where + a == b = + ((==) `on` R.getSearch) a b && (((==) `on` R.getTemplate) a b) + instance A.FromJSON SuggestedTag instance A.ToJSON SuggestedTag @@ -78,6 +85,20 @@ instance A.FromJSON R.RE where instance A.ToJSON R.RE where toJSON = A.toJSON . R.reSource +instance (A.FromJSON re, A.FromJSON s) => A.FromJSON (R.SearchReplace re s) where + parseJSON = + A.withArray + "regular search/replace expression (POSIX)" + ( \a -> do + re <- a `V.indexM` 0 + s <- a `V.indexM` 1 + R.SearchReplace <$> A.parseJSON re <*> A.parseJSON s + ) + +instance (A.ToJSON re, A.ToJSON s) => A.ToJSON (R.SearchReplace re s) where + toJSON (R.SearchReplace re s) = + A.Array (V.fromList [A.toJSON re, A.toJSON s]) + readSettings :: IO Settings readSettings = fmap mconcat diff --git a/app/Store.hs b/app/Store.hs new file mode 100644 index 0000000..7a4637f --- /dev/null +++ b/app/Store.hs @@ -0,0 +1,60 @@ +{-# LANGUAGE BlockArguments #-} +{-# LANGUAGE OverloadedRecordDot #-} + +module Store + ( commitDocument, + replaceDocument, + ) +where + +import Control.Exception (Exception, throwIO) +import Data.Aeson qualified as J +import Data.ByteString.Lazy qualified as LB +import Data.Default (Default (def)) +import Data.List (intercalate) +import Data.String (IsString (fromString)) +import Document qualified as D +import ProbabilityMap qualified as C +import System.Directory (copyFile) +import System.Exit (ExitCode (ExitFailure, ExitSuccess)) +import System.IO.LockFile (withLockFile) +import System.Process.Typed (readProcessStderr) +import Text.Printf (printf) + +replaceDocument :: String -> D.Document -> IO () +replaceDocument message doc = do + withGit do + J.encodeFile doc.iFilePath doc.index + oldDoc <- D.readDocument doc.iFileName + _ <- C.modifyProbabilityCache (C.replaceDocument oldDoc doc) + commitAll [doc.iFilePath, C.fileName] message + +commitDocument :: String -> FilePath -> D.Document -> IO () +commitDocument message original doc = do + withGit do + copyFile original doc.oFilePath + J.encodeFile doc.iFilePath doc.index + _ <- C.modifyProbabilityCache (C.addDocument doc) + commitAll [doc.iFilePath, doc.oFilePath, C.fileName] message + +withGit :: IO a -> IO a +withGit = withLockFile def ".gitlock" + +commitAll :: [FilePath] -> String -> IO () +commitAll fps m = do + sh_ (">/dev/null git add -- " ++ intercalate " " (map (printf "'%s'") fps)) + sh_ (printf ">/dev/null git commit -m '%s' || :" m) + +-- TODO Refacotor library `sh` +sh_ :: String -> IO () +sh_ cmd = do + -- printf "+ %s\n" cmd + (exitCode, err) <- readProcessStderr (fromString cmd) + case exitCode of + ExitSuccess -> return () + ExitFailure exitCode' -> throwIO $ ProcessException exitCode' err + +data ProcessException = ProcessException Int LB.ByteString + deriving (Show) + +instance Exception ProcessException diff --git a/tags/src/Tag.hs b/tags/src/Tag.hs index f7f3398..882f1de 100644 --- a/tags/src/Tag.hs +++ b/tags/src/Tag.hs @@ -51,6 +51,7 @@ module Tag where import Control.Applicative ((<|>)) +import Control.DeepSeq (NFData (rnf)) import Data.Aeson qualified as J import Data.Attoparsec.Text qualified as A import Data.Binary (Binary) @@ -66,6 +67,9 @@ import TypedValue (cast, castDef) data Tag = Tag T.Text (Maybe T.Text) deriving (Show, Generic, Binary, Eq, Ord) +instance NFData Tag where + rnf (Tag k v) = rnf k `seq` rnf v + tag :: T.Text -> Maybe T.Text -> Tag tag = Tag diff --git a/tags/tags.cabal b/tags/tags.cabal index 0149e74..c93aedd 100644 --- a/tags/tags.cabal +++ b/tags/tags.cabal @@ -26,7 +26,8 @@ library containers, regex, text, - time + time, + deepseq hs-source-dirs: src default-language: GHC2021 default-extensions: |