chore: make computing `probabilityMap` more performance

author: Alexander Foremny <aforemny@posteo.de> 2024-01-11 03:20:56 +0100
committer: Alexander Foremny <aforemny@posteo.de> 2024-01-11 03:26:52 +0100
commit: 7bdf16be84b368655ce2ee3d9ab6bf185dfb59b5 (patch)
tree: 3ee48fc98f98ab7ac7ad19e24334e07b8b147dd6
parent: 673c59d9be8b62106ffbba96d805680f0b5e7e3f (diff)
9 files changed, 598 insertions, 246 deletions
diff --git a/apaperless.cabal b/apaperless.cabal
index 480b605..1217b15 100644
--- a/apaperless.cabal
+++ b/apaperless.cabal
@@ -14,14 +14,17 @@ extra-doc-files:    CHANGELOG.md
 -- extra-source-files:
 
 common warnings
-    ghc-options: -Wall -threaded
+    ghc-options: -Wall -threaded -with-rtsopts=-N -rtsopts
 
 executable apaperless
     import:           warnings
     main-is:          Main.hs
     other-modules:
+      Document
+      ProbabilityMap
       Prompt
       Settings
+      Store
     -- other-extensions:
     build-depends:
       base,
@@ -43,6 +46,13 @@ executable apaperless
       time,
       regex,
       yaml,
-      xdg-basedir
+      xdg-basedir,
+      binary,
+      vector,
+      parallel,
+      deepseq,
+      unordered-containers,
+      vector-binary-instances,
+      split
     hs-source-dirs:   app
     default-language: GHC2021
diff --git a/apaperless.yaml b/apaperless.yaml
index ed74742..bbc9bec 100644
--- a/apaperless.yaml
+++ b/apaperless.yaml
@@ -2,7 +2,8 @@ defaultLanguage: deu+eng
 suggestedTags:
 - contents:
   - createdAt
-  - '[0-9]{2}\.[0-9]{2}\.[0-9]{4}'
+  - - ${d}([0-9]{2})\.${m}([0-9]{2})\.${y}([0-9]{4})
+    - ${y}-${m}-${d}
   tag: SuggestTagByRE
 - contents: correspondent
   tag: SuggestTagByTags
diff --git a/app/Document.hs b/app/Document.hs
new file mode 100644
index 0000000..ac8a73b
--- /dev/null
+++ b/app/Document.hs
@@ -0,0 +1,134 @@
+{-# LANGUAGE DataKinds #-}
+{-# LANGUAGE OverloadedRecordDot #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE StrictData #-}
+
+module Document
+  ( Document (..),
+    Index (..),
+    hasWord,
+    hasTag,
+    hasWordAndTag,
+    tagValues,
+    getDocuments,
+    readDocument,
+  )
+where
+
+import Control.Concurrent.ParallelIO.Local (parallel, withPool)
+import Control.Exception (Exception, throwIO)
+import Data.Aeson qualified as J
+import Data.ByteString.Lazy qualified as LB
+import Data.List (sort)
+import Data.Map qualified as M
+import Data.Maybe (mapMaybe, maybeToList)
+import Data.Set qualified as S
+import Data.Text qualified as T
+import Data.Time.Clock (UTCTime)
+import Data.Time.Format.ISO8601 (iso8601Show)
+import GHC.Conc (getNumProcessors)
+import GHC.Generics (Generic)
+import GHC.Records (HasField (..))
+import System.Directory (listDirectory)
+import System.FilePath (takeBaseName, (<.>), (</>))
+import Tag qualified as G
+
+-- TODO Inline `Index`
+
+-- TODO Replace `iFileName` with `id`
+data Document = Document
+  { iFileName :: String,
+    index :: Index
+  }
+  deriving (Show)
+
+instance HasField "oFilePath" Document FilePath where
+  getField doc = "originals" </> takeBaseName doc.iFileName <.> "pdf"
+
+instance HasField "iFilePath" Document FilePath where
+  getField doc = "index" </> doc.iFileName
+
+hasWord :: T.Text -> Document -> Bool
+hasWord word doc = S.member word doc.index.originalWords
+
+hasTag :: G.Tag -> Document -> Bool
+hasTag tag doc = S.member tag doc.index.tags
+
+hasWordAndTag :: T.Text -> G.Tag -> Document -> Bool
+hasWordAndTag word tag doc = hasTag tag doc && hasWord word doc
+
+tagValues :: [Document] -> M.Map T.Text (S.Set T.Text)
+tagValues docs =
+  M.unionsWith S.union $
+    mapMaybe
+      ( \tag ->
+          M.singleton (G.tagKey tag) . S.singleton <$> (G.tagValue tag)
+      )
+      (S.toList (S.unions (map (.index.tags) docs)))
+
+getDocuments :: IO [Document]
+getDocuments =
+  parMapM readDocument
+    =<< sort <$> listDirectory "index"
+
+readDocument :: FilePath -> IO Document
+readDocument iFileName =
+  Document iFileName
+    <$> decodeFile @Index ("index" </> iFileName)
+
+data Index = Index
+  { content :: T.Text,
+    tags :: S.Set G.Tag,
+    addedAt :: UTCTime,
+    modifiedAt :: Maybe UTCTime,
+    todo :: Bool,
+    language :: String
+  }
+  deriving (Show, Generic, Eq)
+
+instance J.ToJSON Index
+
+instance J.FromJSON Index
+
+instance HasField "shortText" Index T.Text where
+  getField =
+    (T.unlines . take 10 . T.lines)
+      . (.content)
+
+instance HasField "internalTags" Index (S.Set G.Tag) where
+  getField index =
+    index.tags `S.union` internalTags index
+
+-- TODO Cache `originalWords`
+--
+-- @related cache-probabilitymap
+instance HasField "originalWords" Index (S.Set T.Text) where
+  getField index =
+    S.fromList (T.words index.content)
+
+internalTags :: Index -> S.Set G.Tag
+internalTags index =
+  S.fromList
+    ( concat
+        [ [ G.tag "addedAt" (Just (T.pack (iso8601Show index.addedAt))),
+            G.tag "language" (Just (T.pack index.language))
+          ],
+          maybeToList (G.tag "modifiedAt" . Just . T.pack . iso8601Show <$> index.modifiedAt),
+          if index.todo then [G.tag "todo" Nothing] else []
+        ]
+    )
+
+parMapM :: (a -> IO b) -> [a] -> IO [b]
+parMapM f xs = do
+  n <- getNumProcessors
+  withPool n $ \pool -> parallel pool (map f xs)
+
+data DecodeException = DecodeException FilePath String
+  deriving (Show)
+
+instance Exception DecodeException
+
+decodeFile :: J.FromJSON a => FilePath -> IO a
+decodeFile fp =
+  either (throwIO . DecodeException fp) pure . J.eitherDecode
+    =<< LB.readFile fp
diff --git a/app/Main.hs b/app/Main.hs
index 93fed75..936f053 100644
--- a/app/Main.hs
+++ b/app/Main.hs
@@ -1,5 +1,6 @@
 {-# LANGUAGE BlockArguments #-}
 {-# LANGUAGE DataKinds #-}
+{-# LANGUAGE DeriveAnyClass #-}
 {-# LANGUAGE DuplicateRecordFields #-}
 {-# LANGUAGE LambdaCase #-}
 {-# LANGUAGE MultiWayIf #-}
@@ -16,41 +17,42 @@ module Main where
 import Control.Arrow (first, second)
 import Control.Concurrent.ParallelIO.Local (parallel, withPool)
 import Control.Exception (Exception, throw, throwIO)
-import Control.Monad (forM, unless, when)
+import Control.Monad (unless, when)
 import Data.Aeson qualified as J
 import Data.Attoparsec.Text qualified as A
 import Data.ByteString.Lazy qualified as LB
-import Data.Default
 import Data.Digest.Pure.SHA (sha1, showDigest)
 import Data.Either (partitionEithers)
 import Data.Function ((&))
+import Data.HashMap.Internal qualified as HM
 import Data.List
 import Data.List.NonEmpty qualified as N
 import Data.Map qualified as M
-import Data.Maybe (catMaybes, fromMaybe, mapMaybe, maybeToList)
+import Data.Maybe (fromMaybe, mapMaybe)
 import Data.Ord (comparing)
 import Data.Set qualified as S
 import Data.String (IsString (fromString))
 import Data.Text qualified as T
 import Data.Text.Encoding qualified as T
 import Data.Text.IO qualified as T
-import Data.Time.Clock (UTCTime, getCurrentTime)
-import Data.Time.Format.ISO8601 (iso8601ParseM, iso8601Show)
+import Data.Time.Clock (getCurrentTime)
+import Data.Time.Format.ISO8601 (iso8601ParseM)
 import Debug.Trace
+import Document qualified as D
 import GHC.Conc (getNumProcessors)
-import GHC.Generics (Generic)
-import GHC.Records (HasField (..))
 import Options.Applicative qualified as O
+import ProbabilityMap qualified as C
 import Prompt qualified as P
 import Settings qualified as S
+import Store qualified as R
 import System.Directory
 import System.Environment (getEnv)
 import System.FilePath
-import System.IO.LockFile (withLockFile)
 import System.IO.Temp (withSystemTempDirectory)
 import System.Process.Typed
 import Tag qualified as G
 import Text.Printf (printf)
+import Text.RE.Replace qualified as R
 import Text.RE.TDFA.Text qualified as R
 import Text.Read (readMaybe)
 
@@ -291,13 +293,13 @@ main = do
               keep
           )
           (map (cwd </>) inputs)
-      docs <- mapM (readDocument . (<.> "json")) indexNames
-      allDocs <- getDocuments
+      docs <- mapM (D.readDocument . (<.> "json")) indexNames
+      probabilityCache <- C.readProbabilityCache
       docs' <-
         if
-            | auto -> processDocuments settings allDocs docs
+            | auto -> processDocuments settings probabilityCache docs
             -- TODO adding tags interactively through prompt does not persist them in store
-            | prompt -> processDocumentsInteractively settings allDocs docs
+            | prompt -> processDocumentsInteractively settings probabilityCache docs
             | otherwise -> pure docs
       mapM_
         ( \doc -> do
@@ -307,7 +309,7 @@ main = do
         docs'
     Args {cmd = Edit {indexNames}} -> do
       editDocuments
-        =<< mapM (readDocument . (<.> "json")) indexNames
+        =<< mapM (D.readDocument . (<.> "json")) indexNames
     Args {cmd = List {filters, redo, todo = False, view = False, edit = False}} -> do
       doRedoIf filters redo
       mapM_
@@ -316,24 +318,26 @@ main = do
             printTags doc
         )
         . filter (G.applyFilters filters . (.index.tags))
-        =<< getDocuments
+        =<< D.getDocuments
     Args {cmd = List {filters, redo, edit = True}} -> do
       doRedoIf filters redo
       editDocuments
         . filter (G.applyFilters filters . (.index.tags))
-        =<< getDocuments
+        =<< D.getDocuments
     Args {cmd = List {filters, redo, todo = True}} -> do
       doRedoIf filters redo
-      allDocs <- getDocuments
+      allDocs <- D.getDocuments
+      probabilityCache <- C.readProbabilityCache
       _ <-
-        processDocumentsInteractively settings allDocs
+        processDocumentsInteractively settings probabilityCache
           . filter (G.applyFilters filters . (.index.tags))
           $ allDocs
       pure ()
     Args {cmd = Todo} -> do
-      allDocs <- getDocuments
+      allDocs <- D.getDocuments
+      probabilityCache <- C.readProbabilityCache
       _ <-
-        processDocumentsInteractively settings allDocs
+        processDocumentsInteractively settings probabilityCache
           . filter (G.applyFilters [G.filter G.include "todo" Nothing] . (.index.tags))
           $ allDocs
       pure ()
@@ -341,12 +345,12 @@ main = do
       doRedoIf filters redo
       viewDocuments
         . filter (G.applyFilters filters . (.index.tags))
-        =<< getDocuments
+        =<< D.getDocuments
     Args {cmd = View {indexNames}} -> do
       viewDocuments
-        =<< mapM (readDocument . (<.> "json")) indexNames
+        =<< mapM (D.readDocument . (<.> "json")) indexNames
     Args {cmd = TopWords} -> do
-      allDocs <- getDocuments
+      probabilityCache <- C.readProbabilityCache
       mapM_
         ( \(tag, xs) -> do
             print tag
@@ -355,24 +359,23 @@ main = do
         $ map (second (sortBy (comparing (negate . snd))))
         $ map (second (filter ((> 0) . snd)))
         $ M.toList
-        $ foldl
-          ( \wordProbabilityPerTag (word, tag, p) ->
+        $ M.foldlWithKey
+          ( \wordProbabilityPerTag (word, tag) p ->
               M.insertWith (++) tag [(word, p)] wordProbabilityPerTag
           )
           M.empty
-          (probabilityMap allDocs)
+          probabilityCache.probabilityMap
     Args {cmd = Modify {indexNames, tags, untags}} -> do
-      docs <- mapM (readDocument . (<.> "json")) indexNames
+      docs <- mapM (D.readDocument . (<.> "json")) indexNames
       mapM_
-        ( \doc -> do
-            let doc' = addTags tags (removeTags untags doc)
-            withGit do
-              J.encodeFile doc'.iFilePath doc'.index
-              commitAll [doc.iFilePath] (printf "tag %s" (takeBaseName doc.iFilePath))
+        ( \doc ->
+            R.replaceDocument
+              (printf "tag %s" (takeBaseName doc.iFilePath))
+              doc
         )
-        docs
+        (map (addTags tags . removeTags untags) docs)
 
-printTags :: Document -> IO ()
+printTags :: D.Document -> IO ()
 printTags doc =
   mapM_
     ( \tag ->
@@ -380,76 +383,36 @@ printTags doc =
           Nothing -> printf "@%s\n" (G.tagKey tag)
           Just tagValue -> printf "@%s %s\n" (G.tagKey tag) tagValue
     )
-    ( filter
-        ( \tag ->
-            -- TODO Handle tags hidden by default
-            --
-            -- - Add `list-default-hide-tags` to `Settings`
-            -- - Add `--show-tags`, `--hide-tags` to `List`
-            not ("paperless-ngx." `T.isPrefixOf` G.tagKey tag)
-              && (G.tagKey tag /= "content")
-              && (G.tagKey tag /= "importedFrom")
-              && (G.tagKey tag /= "language")
-        )
+    ( filter arbitrarilySelectTag
         . S.toList
         $ doc.index.tags `S.union` doc.index.internalTags
     )
 
+arbitrarilySelectTag :: G.Tag -> Bool
+arbitrarilySelectTag tag =
+  -- TODO Handle tags hidden by default
+  --
+  -- - Add `list-default-hide-tags` to `Settings`
+  -- - Add `--show-tags`, `--hide-tags` to `List`
+  not ("paperless-ngx." `T.isPrefixOf` G.tagKey tag)
+    && (G.tagKey tag /= "content")
+    && (G.tagKey tag /= "importedFrom")
+    && (G.tagKey tag /= "language")
+
 doRedoIf :: [G.Filter] -> Bool -> IO ()
 doRedoIf filters redo =
   when redo do
     parMapM_ doRedo
       . filter (G.applyFilters filters . (.index.tags))
-      =<< getDocuments
+      =<< D.getDocuments
   where
     doRedo doc = do
       content <- ocr doc.index.language doc.oFilePath
-      withGit do
-        J.encodeFile doc.iFilePath doc.index {content = content}
-        commitAll [doc.iFilePath] (printf "redo %s" (takeBaseName doc.iFilePath))
-
-data Document = Document
-  { iFileName :: String,
-    index :: Index
-  }
-  deriving (Show)
-
-instance HasField "oFilePath" Document FilePath where
-  getField doc = "originals" </> takeBaseName doc.iFileName <.> "pdf"
-
-instance HasField "iFilePath" Document FilePath where
-  getField doc = "index" </> doc.iFileName
-
-hasWord :: T.Text -> Document -> Bool
-hasWord word doc = S.member word doc.index.originalWords
-
-hasTag :: G.Tag -> Document -> Bool
-hasTag tag doc = S.member tag doc.index.tags
-
-hasWordAndTag :: T.Text -> G.Tag -> Document -> Bool
-hasWordAndTag word tag doc = hasTag tag doc && hasWord word doc
-
-tagValues :: [Document] -> M.Map T.Text (S.Set T.Text)
-tagValues docs =
-  M.unionsWith S.union $
-    mapMaybe
-      ( \tag ->
-          M.singleton (G.tagKey tag) . S.singleton <$> (G.tagValue tag)
-      )
-      (S.toList (S.unions (map (.index.tags) docs)))
+      let doc' = doc {D.index = doc.index {D.content = content}}
+      R.replaceDocument (printf "redo %s" (takeBaseName doc.iFilePath)) doc'
 
-getDocuments :: IO [Document]
-getDocuments =
-  parMapM readDocument
-    =<< sort <$> listDirectory "index"
-
-readDocument :: FilePath -> IO Document
-readDocument iFileName =
-  Document iFileName
-    <$> decodeFile @Index ("index" </> iFileName)
-
-processDocumentsInteractively :: S.Settings -> [Document] -> [Document] -> IO [Document]
-processDocumentsInteractively settings allDocs docs =
+processDocumentsInteractively :: S.Settings -> C.ProbabilityCache -> [D.Document] -> IO [D.Document]
+processDocumentsInteractively settings probabilityCache docs = do
   mapM (uncurry processDocumentInteractively) (zip [1 :: Int ..] docs)
   where
     numDocs = length docs
@@ -478,13 +441,13 @@ processDocumentsInteractively settings allDocs docs =
           printf "%s\n" (takeBaseName doc.iFileName)
           printf "%s\n" doc.index.content
           processDocumentInteractively n doc
-        "p" -> tagDocumentInteractively settings allDocs doc
+        "p" -> tagDocumentInteractively settings probabilityCache doc
         "s" -> pure doc
         "v" -> do
           viewDocuments [doc]
           processDocumentInteractively n doc
 
-viewDocuments :: [Document] -> IO ()
+viewDocuments :: [D.Document] -> IO ()
 viewDocuments docs =
   sh_
     ( "zathura "
@@ -494,7 +457,7 @@ viewDocuments docs =
            )
     )
 
-editDocuments :: [Document] -> IO ()
+editDocuments :: [D.Document] -> IO ()
 editDocuments docs =
   withSystemTempDirectory "apaperless" $ \tmp -> do
     let fp doc = tmp </> takeBaseName doc.iFileName <.> "txt"
@@ -513,63 +476,49 @@ editDocuments docs =
     mapM_
       ( \doc -> do
           content <- T.readFile (fp doc)
-          withGit do
-            J.encodeFile doc.iFilePath doc.index {content = content}
-            commitAll [doc.iFilePath] (printf "edit %s" (takeBaseName doc.iFilePath))
+          let doc' = doc {D.index = doc.index {D.content = content}}
+          R.replaceDocument (printf "edit %s" (takeBaseName doc.iFilePath)) doc'
           pure content
       )
       docs
 
-type ProbabilityMap = [(T.Text, G.Tag, Double)]
-
--- TODO Consider words that contribute to NOT adding a tag
---
--- If there is a document that should, say, not have a `@correspondent`, we should score words that contribute to that fact as well.
---
--- @topic probability-map
-probabilityMap :: [Document] -> ProbabilityMap
-probabilityMap allDocs =
-  [ let docsWithWord = filter (hasWord word) allDocs
-        docsWithTag = filter (hasTag tag) allDocs
-        docsWithWordAndTag = filter (hasWordAndTag word tag) allDocs
-        p =
-          fi (length docsWithWordAndTag)
-            / fi
-              ( length docsWithWord
-                  + length docsWithTag
-                  - length docsWithWordAndTag
-              )
-     in (word, tag, p)
-    | word <- S.toList allWords,
-      tag <- S.toList allTags
-  ]
-  where
-    allTags = foldl S.union S.empty (map (.index.tags) allDocs)
-    allWords = foldl S.union S.empty (map (.index.originalWords) allDocs)
-    fi = fromIntegral @Int @Double
-
-suggestTags :: S.Settings -> [Document] -> Document -> IO [(G.Tag, [G.Tag])]
-suggestTags settings allDocs doc = do
-  forM settings.suggestedTags $ \suggestedTag -> do
+suggestTags :: S.Settings -> C.ProbabilityCache -> D.Document -> [(G.Tag, [G.Tag])]
+suggestTags settings probabilityCache doc = do
+  flip map settings.suggestedTags $ \suggestedTag ->
     case suggestedTag of
-      S.SuggestTagByRE tagName pattern -> do
+      S.SuggestTagByRE tagName pattern ->
         let tagValues =
-              nub . catMaybes . map R.matchedText . R.allMatches $
-                doc.index.content
-                  R.*=~ pattern
-        pure (G.tag tagName (Just ""), map (G.tag tagName . Just) tagValues)
-      S.SuggestTagByTags tagName -> do
+              nub
+                . mapMaybe
+                  ( -- XXX Whys is this so complicated?
+                    \match -> do
+                      captures <- snd <$> R.matchCaptures match
+                      pure
+                        ( foldl
+                            ( \template (captureName, captureOrdinal) ->
+                                T.replace
+                                  ("${" <> R.getCaptureName captureName <> "}")
+                                  (R.capturedText (captures !! (fromEnum captureOrdinal - 1)))
+                                  template
+                            )
+                            (R.getTemplate pattern)
+                            (HM.toList $ R.captureNames match)
+                        )
+                  )
+                $ R.allMatches (doc.index.content R.*=~ (R.getSearch pattern))
+         in (G.tag tagName (Just ""), map (G.tag tagName . Just) tagValues)
+      S.SuggestTagByTags tagName ->
         let tagValues =
-              -- TODO Cache `probabilityMap`
+              -- TODO Cache `probabilityCache`
               --
               -- @topic probability-map
-              probabilityMap allDocs
-                & filter
-                  ( \(word, tag, _) ->
-                      G.tagKey tag == tagName && hasWord word doc
+              probabilityCache.probabilityMap
+                & M.filterWithKey
+                  ( \(word, tag) _ ->
+                      G.tagKey tag == tagName && D.hasWord word doc
                   )
-                & foldl'
-                  ( \scorePerTagValue (_, tag, p) ->
+                & M.foldlWithKey'
+                  ( \scorePerTagValue (_, tag) p ->
                       M.insertWith
                         (+)
                         (G.tagValue tag)
@@ -580,7 +529,7 @@ suggestTags settings allDocs doc = do
                 & M.toList
                 & sortBy (comparing (negate . snd))
                 & map fst
-        pure (G.tag tagName (Just ""), map (G.tag tagName) tagValues)
+         in (G.tag tagName (Just ""), map (G.tag tagName) tagValues)
 
 autoApplySuggestedTags :: [(G.Tag, [G.Tag])] -> [G.Tag]
 autoApplySuggestedTags =
@@ -591,28 +540,26 @@ autoApplySuggestedTags =
           else Just (head suggestedTags)
     )
 
-processDocuments :: S.Settings -> [Document] -> [Document] -> IO [Document]
-processDocuments settings allDocs docs =
+processDocuments :: S.Settings -> C.ProbabilityCache -> [D.Document] -> IO [D.Document]
+processDocuments settings probabilityCache docs =
   mapM processDocument docs
   where
     processDocument doc = do
-      tags <-
-        S.fromList . autoApplySuggestedTags
-          <$> suggestTags settings allDocs doc
+      let tags =
+            S.fromList . autoApplySuggestedTags $
+              suggestTags settings probabilityCache doc
       let doc' =
             doc
-              { index =
+              { D.index =
                   doc.index
-                    { tags = doc.index.tags `S.union` tags,
-                      todo = False
+                    { D.tags = doc.index.tags `S.union` tags,
+                      D.todo = False
                     }
               }
-      withGit do
-        J.encodeFile doc.iFilePath doc'.index
-        commitAll [doc.iFilePath] (printf "process %s (auto)" doc.iFilePath)
+      R.replaceDocument (printf "process %s (auto)" doc.iFilePath) doc'
       pure doc'
 
-applyTags :: [Either T.Text G.Tag] -> Document -> Document
+applyTags :: [Either T.Text G.Tag] -> D.Document -> D.Document
 applyTags tags' doc = do
   addTags tags (removeTags untags doc)
   where
@@ -625,43 +572,43 @@ data AddTagException = AddTagException G.Tag
 
 instance Exception AddTagException
 
-addTags :: [G.Tag] -> Document -> Document
+addTags :: [G.Tag] -> D.Document -> D.Document
 addTags =
   flip (foldl (flip addTag))
 
-addTag :: G.Tag -> Document -> Document
+addTag :: G.Tag -> D.Document -> D.Document
 addTag tag doc =
   if
       | G.tagKey tag == "todo" ->
-          doc {index = doc.index {todo = True}}
+          doc {D.index = doc.index {D.todo = True}}
       | G.tagKey tag == "addedAt" ->
           maybe
             (throw (AddTagException tag))
             ( \addedAt ->
-                doc {index = doc.index {addedAt = addedAt}}
+                doc {D.index = doc.index {D.addedAt = addedAt}}
             )
             (iso8601ParseM . T.unpack =<< (G.tagValue tag))
       | G.tagKey tag == "modifiedAt" ->
           maybe
             (throw (AddTagException tag))
             ( \modifiedAt ->
-                doc {index = doc.index {modifiedAt = Just modifiedAt}}
+                doc {D.index = doc.index {D.modifiedAt = Just modifiedAt}}
             )
             (iso8601ParseM . T.unpack =<< (G.tagValue tag))
       | G.tagKey tag == "content" ->
           maybe
             (throw (AddTagException tag))
             ( \content ->
-                doc {index = doc.index {content = content}}
+                doc {D.index = doc.index {D.content = content}}
             )
             (G.tagValue tag)
       | G.tagKey tag == "language" ->
           throw (AddTagException tag)
       | otherwise ->
           doc
-            { index =
+            { D.index =
                 doc.index
-                  { tags = G.replace tag doc.index.tags
+                  { D.tags = G.replace tag doc.index.tags
                   }
             }
 
@@ -670,15 +617,15 @@ data RemoveTagException = RemoveTagException G.Tag
 
 instance Exception RemoveTagException
 
-removeTags :: [G.Tag] -> Document -> Document
+removeTags :: [G.Tag] -> D.Document -> D.Document
 removeTags =
   flip (foldl (flip removeTag))
 
-removeTag :: G.Tag -> Document -> Document
+removeTag :: G.Tag -> D.Document -> D.Document
 removeTag tag doc =
   if
       | G.tagKey tag == "todo" ->
-          doc {index = doc.index {todo = False}}
+          doc {D.index = doc.index {D.todo = False}}
       | G.tagKey tag == "addedAt" ->
           throw (RemoveTagException tag)
       | G.tagKey tag == "modifiedAt" ->
@@ -689,9 +636,9 @@ removeTag tag doc =
           throw (RemoveTagException tag)
       | otherwise ->
           doc
-            { index =
+            { D.index =
                 doc.index
-                  { tags =
+                  { D.tags =
                       maybe
                         (G.deleteAll (G.tagKey tag))
                         (\_ -> G.delete tag)
@@ -700,18 +647,16 @@ removeTag tag doc =
                   }
             }
 
-tagDocumentInteractively :: S.Settings -> [Document] -> Document -> IO Document
-tagDocumentInteractively settings allDocs doc = do
-  suggestedTags <- suggestTags settings allDocs doc
+tagDocumentInteractively :: S.Settings -> C.ProbabilityCache -> D.Document -> IO D.Document
+tagDocumentInteractively settings probabilityCache doc = do
+  let suggestedTags = suggestTags settings probabilityCache doc
   tags <- mapM (uncurry tagDocumentInteractively') suggestedTags
   let doc' =
         (applyTags tags doc)
-          { index = doc.index {todo = False}
+          { D.index = doc.index {D.todo = False}
           }
-  withGit do
-    J.encodeFile doc.iFilePath doc'.index
-    commitAll [doc.iFilePath] (printf "process %s (interactive)" doc.iFilePath)
-    pure doc'
+  R.replaceDocument (printf "process %s (interactive)" doc.iFilePath) doc'
+  pure doc'
   where
     tagDocumentInteractively' :: G.Tag -> [G.Tag] -> IO (Either T.Text G.Tag)
     tagDocumentInteractively' tag tags
@@ -770,30 +715,19 @@ consume1 language force keep filePath = do
       then ocr language filePath
       else pure content'
   addedAt <- getCurrentTime
-  withGit do
-    J.encodeFile
-      iFilePath
-      Index
-        { tags = S.empty,
-          todo = True,
-          modifiedAt = Nothing,
-          ..
-        }
-    if keep
-      then copyFile filePath oFilePath
-      else renameFile filePath oFilePath
-    commitAll
-      [iFilePath, oFilePath]
-      (printf "add %s" (takeFileName filePath))
-  pure (takeBaseName iFilePath)
-
-withGit :: IO a -> IO a
-withGit = withLockFile def ".gitlock"
+  let doc =
+        D.Document (fKey <.> "json") $
+          D.Index
+            { tags = S.empty,
+              todo = True,
+              modifiedAt = Nothing,
+              ..
+            }
+  R.commitDocument (printf "add %s" (takeFileName filePath)) filePath doc
+  when (not keep) do
+    removeFile filePath
 
-commitAll :: [FilePath] -> String -> IO ()
-commitAll fps m = do
-  sh_ (">/dev/null git add -- " ++ intercalate " " (map (printf "'%s'") fps))
-  sh_ (printf ">/dev/null git commit -m '%s' || :" m)
+  pure (takeBaseName iFilePath)
 
 data DecodeException = DecodeException FilePath String
   deriving (Show)
@@ -839,49 +773,6 @@ ocr1 language tmp input =
     -- XXX `--oem 1` seems to be unavailable
     <$> sh (printf "tesseract '%s' - -l '%s' --oem 3 --psm 1" (tmp </> input) language)
 
-data Index = Index
-  { content :: T.Text,
-    tags :: S.Set G.Tag,
-    addedAt :: UTCTime,
-    modifiedAt :: Maybe UTCTime,
-    todo :: Bool,
-    language :: String
-  }
-  deriving (Show, Generic, Eq)
-
-instance J.ToJSON Index
-
-instance J.FromJSON Index
-
-instance HasField "shortText" Index T.Text where
-  getField =
-    (T.unlines . take 10 . T.lines)
-      . (.content)
-
-instance HasField "internalTags" Index (S.Set G.Tag) where
-  getField index =
-    index.tags
-      `S.union` internalTags index
-
--- TODO Cache `originalWords`
---
--- @related cache-probabilitymap
-instance HasField "originalWords" Index (S.Set T.Text) where
-  getField index =
-    S.fromList (T.words index.content)
-
-internalTags :: Index -> S.Set G.Tag
-internalTags index =
-  S.fromList
-    ( concat
-        [ [ G.tag "addedAt" (Just (T.pack (iso8601Show index.addedAt))),
-            G.tag "language" (Just (T.pack index.language))
-          ],
-          maybeToList (G.tag "modifiedAt" . Just . T.pack . iso8601Show <$> index.modifiedAt),
-          if index.todo then [G.tag "todo" Nothing] else []
-        ]
-    )
-
 data PdfInfo = PdfInfo
   { numPages :: Int,
     pageSize :: (Double, Double)
diff --git a/app/ProbabilityMap.hs b/app/ProbabilityMap.hs
new file mode 100644
index 0000000..9a700d7
--- /dev/null
+++ b/app/ProbabilityMap.hs
@@ -0,0 +1,230 @@
+{-# LANGUAGE DeriveAnyClass #-}
+{-# LANGUAGE DuplicateRecordFields #-}
+{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE OverloadedRecordDot #-}
+{-# LANGUAGE RecordWildCards #-}
+{-# LANGUAGE StrictData #-}
+{-# LANGUAGE ViewPatterns #-}
+{-# LANGUAGE NoFieldSelectors #-}
+{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
+
+module ProbabilityMap
+  ( fileName,
+    ProbabilityCache (..),
+    readProbabilityCache,
+    writeProbabilityCache,
+    modifyProbabilityCache,
+    fromDocuments,
+    addDocument,
+    deleteDocument,
+    replaceDocument,
+  )
+where
+
+import Control.Applicative (liftA2)
+import Control.Arrow ((***))
+import Control.DeepSeq (NFData (..))
+import Control.Exception (SomeException, try)
+import Control.Parallel.Strategies
+import Data.Binary qualified as B
+import Data.List (foldl')
+import Data.Map qualified as M
+import Data.Maybe (fromMaybe)
+import Data.Set qualified as S
+import Data.Text qualified as T
+import Data.Vector qualified as V
+import Data.Vector.Binary ()
+import Document qualified as D
+import GHC.Conc (numCapabilities)
+import GHC.Generics (Generic)
+import Tag qualified as G
+
+data ProbabilityCache = ProbabilityCache
+  { docsPerWord :: M.Map T.Text Int,
+    docsPerTag :: M.Map G.Tag Int,
+    docsPerWordAndTag :: M.Map (T.Text, G.Tag) Int,
+    probabilityMap :: M.Map (T.Text, G.Tag) Double
+  }
+  deriving (Show, Generic)
+
+data ProbabilityCache' = ProbabilityCache'
+  { docsPerWord :: M.Map T.Text Int,
+    docsPerTag :: M.Map G.Tag Int,
+    docsPerWordAndTag :: M.Map (T.Text, G.Tag) Int
+  }
+  deriving (Show, Generic)
+
+instance NFData ProbabilityCache' where
+  rnf (ProbabilityCache' {..}) =
+    rnf docsPerWord `seq`
+      rnf docsPerTag `seq`
+        rnf docsPerWordAndTag
+
+fromProbabilityCache' :: ProbabilityCache' -> ProbabilityCache
+fromProbabilityCache' probabilityCache'@(ProbabilityCache' {..}) =
+  ProbabilityCache {probabilityMap = probabilityMap probabilityCache', ..}
+
+toProbabilityCache' :: ProbabilityCache -> ProbabilityCache'
+toProbabilityCache' (ProbabilityCache {..}) =
+  ProbabilityCache' {..}
+
+instance B.Binary ProbabilityCache where
+  get = fromCacheRep <$> B.get
+  put = B.put . toCacheRep
+
+instance NFData ProbabilityCache where
+  rnf (ProbabilityCache {..}) =
+    rnf docsPerWord `seq`
+      rnf docsPerTag `seq`
+        rnf docsPerWordAndTag `seq`
+          rnf probabilityMap
+
+data CacheRep = CacheRep
+  { allWords :: V.Vector T.Text,
+    allTags :: V.Vector G.Tag,
+    docsPerWord :: M.Map Int Int,
+    docsPerTag :: M.Map Int Int,
+    docsPerWordAndTag :: M.Map (Int, Int) Int,
+    probabilityMap :: M.Map (Int, Int) Double
+  }
+  deriving (Show, Generic, B.Binary)
+
+instance NFData CacheRep where
+  rnf (CacheRep {..}) =
+    rnf allWords `seq`
+      rnf allTags `seq`
+        rnf docsPerWord `seq`
+          rnf docsPerTag `seq`
+            rnf docsPerWordAndTag `seq`
+              rnf probabilityMap
+
+fromCacheRep :: CacheRep -> ProbabilityCache
+fromCacheRep (CacheRep {..}) =
+  ProbabilityCache
+    { docsPerWord = M.mapKeys fromWord docsPerWord,
+      docsPerTag = M.mapKeys fromTag docsPerTag,
+      docsPerWordAndTag = M.mapKeys fromWordAndTag docsPerWordAndTag,
+      probabilityMap = M.mapKeys fromWordAndTag probabilityMap
+    }
+  where
+    fromWord = (allWords V.!)
+    fromTag = (allTags V.!)
+    fromWordAndTag = fromWord *** fromTag
+
+toCacheRep :: ProbabilityCache -> CacheRep
+toCacheRep (ProbabilityCache {..}) =
+  CacheRep
+    { allWords = V.fromList allWords',
+      allTags = V.fromList allTags',
+      docsPerWord = M.mapKeys toWord docsPerWord,
+      docsPerTag = M.mapKeys toTag docsPerTag,
+      docsPerWordAndTag = M.mapKeys toWordAndTag docsPerWordAndTag,
+      probabilityMap = M.mapKeys toWordAndTag probabilityMap
+    }
+  where
+    allWords' = M.keys docsPerWord
+    allTags' = M.keys docsPerTag
+    allWordsMap = M.fromList $ zip allWords' [0 ..]
+    allTagsMap = M.fromList $ zip allTags' [0 ..]
+    toWord = (allWordsMap M.!)
+    toTag = (allTagsMap M.!)
+    toWordAndTag = toWord *** toTag
+
+fileName :: FilePath
+fileName = "probabilityCache"
+
+readProbabilityCache :: IO ProbabilityCache
+readProbabilityCache =
+  try (B.decodeFile fileName) >>= \case
+    Left (_ :: SomeException) -> fromDocuments <$> D.getDocuments
+    Right probabilityCache -> pure probabilityCache
+
+writeProbabilityCache :: ProbabilityCache -> IO ()
+writeProbabilityCache probabilityCache =
+  B.encodeFile fileName probabilityCache
+
+modifyProbabilityCache :: (ProbabilityCache -> ProbabilityCache) -> IO ProbabilityCache
+modifyProbabilityCache f =
+  liftA2 (>>) (writeProbabilityCache . f) pure =<< readProbabilityCache
+
+fromDocuments :: [D.Document] -> ProbabilityCache
+fromDocuments allDocs =
+  fromProbabilityCache' $
+    foldl' (flip addDocument') (ProbabilityCache' M.empty M.empty M.empty) allDocs
+
+probabilityMap :: ProbabilityCache' -> M.Map (T.Text, G.Tag) Double
+probabilityMap (ProbabilityCache' {..}) =
+  M.unions
+    . withStrategy (parList rdeepseq)
+    . map
+      ( M.fromList
+          . filter (\(_, p) -> p > 0)
+          . map (\k@(word, tag) -> (k, probability word tag))
+      )
+    $ streamsOf
+      numCapabilities
+      [ (word, tag)
+        | word <- allWords,
+          tag <- allTags
+      ]
+  where
+    probability word tag =
+      let docsWithWord = docsPerWord M.! word
+          docsWithTag = docsPerTag M.! tag
+          docsWithWordAndTag = fromMaybe 0 $ M.lookup (word, tag) docsPerWordAndTag
+       in fi docsWithWordAndTag / fi (docsWithWord + docsWithTag - docsWithWordAndTag)
+
+    allWords = M.keys docsPerWord
+    allTags = M.keys docsPerTag
+    fi = fromIntegral @Int @Double
+
+streamsOf :: Int -> [a] -> [[a]]
+streamsOf 1 xs = [xs]
+streamsOf n xs
+  | n > 0 = [everyN k xs | k <- [0 .. n - 1]]
+  | otherwise = []
+  where
+    everyN k xs = map snd $ filter ((== k) . (`mod` n) . fst) $ zip [0 ..] xs
+
+addDocument :: D.Document -> ProbabilityCache -> ProbabilityCache
+addDocument doc probabilityCache =
+  fromProbabilityCache' $ addDocument' doc (toProbabilityCache' probabilityCache)
+
+deleteDocument :: D.Document -> ProbabilityCache -> ProbabilityCache
+deleteDocument doc probabilityCache =
+  fromProbabilityCache' $ deleteDocument' doc (toProbabilityCache' probabilityCache)
+
+replaceDocument :: D.Document -> D.Document -> ProbabilityCache -> ProbabilityCache
+replaceDocument oldDoc newDoc probabilityCache =
+  fromProbabilityCache' $
+    (addDocument' newDoc . deleteDocument' oldDoc) (toProbabilityCache' probabilityCache)
+
+addDocument' :: D.Document -> ProbabilityCache' -> ProbabilityCache'
+addDocument' doc probabilityCache =
+  combining (+) probabilityCache (fromDocument doc)
+
+deleteDocument' :: D.Document -> ProbabilityCache' -> ProbabilityCache'
+deleteDocument' doc probabilityCache =
+  combining (-) probabilityCache (fromDocument doc)
+
+combining :: (Int -> Int -> Int) -> ProbabilityCache' -> ProbabilityCache' -> ProbabilityCache'
+combining f probabilityCache probabilityCache' =
+  ProbabilityCache' {..}
+  where
+    docsPerWord = M.unionWith f probabilityCache.docsPerWord probabilityCache'.docsPerWord
+    docsPerTag = M.unionWith f probabilityCache.docsPerTag probabilityCache'.docsPerTag
+    docsPerWordAndTag = M.unionWith f probabilityCache.docsPerWordAndTag probabilityCache'.docsPerWordAndTag
+
+fromDocument :: D.Document -> ProbabilityCache'
+fromDocument doc =
+  let words = S.toList doc.index.originalWords
+      tags = S.toList doc.index.tags
+      wordAndTags = [(word, tag) | word <- words, tag <- tags]
+   in ProbabilityCache'
+        { docsPerWord = docsPer words,
+          docsPerTag = docsPer tags,
+          docsPerWordAndTag = docsPer wordAndTags
+        }
+  where
+    docsPer :: Ord k => [k] -> M.Map k Int
+    docsPer = M.fromList . map (flip (,) 1)
diff --git a/app/Settings.hs b/app/Settings.hs
index 08b40bb..43f721c 100644
--- a/app/Settings.hs
+++ b/app/Settings.hs
@@ -16,6 +16,7 @@ import Data.Aeson qualified as A
 import Data.Function (on)
 import Data.List (nub)
 import Data.Text qualified as T
+import Data.Vector qualified as V
 import Data.Yaml (decodeFileThrow, encodeFile)
 import GHC.Generics (Generic)
 import System.Directory (doesFileExist)
@@ -47,7 +48,9 @@ instance Monoid Settings where
             -- -   perform minor corrections on OCR, ie. parse `0 1.01.1970`
             --
             -- @topic suggested-tags
-            SuggestTagByRE "createdAt" [R.re|[0-9]{2}\.[0-9]{2}\.[0-9]{4}|],
+            SuggestTagByRE
+              "createdAt"
+              [R.ed|${d}([0-9]{2})\.${m}([0-9]{2})\.${y}([0-9]{4})///${y}-${m}-${d}|],
             SuggestTagByTags "correspondent"
           ],
         defaultLanguage = "deu+eng"
@@ -58,7 +61,7 @@ instance A.FromJSON Settings
 instance A.ToJSON Settings
 
 data SuggestedTag
-  = SuggestTagByRE T.Text R.RE
+  = SuggestTagByRE T.Text (R.SearchReplace R.RE T.Text)
   | SuggestTagByTags T.Text
   deriving (Show, Generic, Eq)
 
@@ -68,6 +71,10 @@ instance Show R.RE where
 instance Eq R.RE where
   (==) = (==) `on` show
 
+instance (Eq re, Eq s) => Eq (R.SearchReplace re s) where
+  a == b =
+    ((==) `on` R.getSearch) a b && (((==) `on` R.getTemplate) a b)
+
 instance A.FromJSON SuggestedTag
 
 instance A.ToJSON SuggestedTag
@@ -78,6 +85,20 @@ instance A.FromJSON R.RE where
 instance A.ToJSON R.RE where
   toJSON = A.toJSON . R.reSource
 
+instance (A.FromJSON re, A.FromJSON s) => A.FromJSON (R.SearchReplace re s) where
+  parseJSON =
+    A.withArray
+      "regular search/replace expression (POSIX)"
+      ( \a -> do
+          re <- a `V.indexM` 0
+          s <- a `V.indexM` 1
+          R.SearchReplace <$> A.parseJSON re <*> A.parseJSON s
+      )
+
+instance (A.ToJSON re, A.ToJSON s) => A.ToJSON (R.SearchReplace re s) where
+  toJSON (R.SearchReplace re s) =
+    A.Array (V.fromList [A.toJSON re, A.toJSON s])
+
 readSettings :: IO Settings
 readSettings =
   fmap mconcat
diff --git a/app/Store.hs b/app/Store.hs
new file mode 100644
index 0000000..7a4637f
--- /dev/null
+++ b/app/Store.hs
@@ -0,0 +1,60 @@
+{-# LANGUAGE BlockArguments #-}
+{-# LANGUAGE OverloadedRecordDot #-}
+
+module Store
+  ( commitDocument,
+    replaceDocument,
+  )
+where
+
+import Control.Exception (Exception, throwIO)
+import Data.Aeson qualified as J
+import Data.ByteString.Lazy qualified as LB
+import Data.Default (Default (def))
+import Data.List (intercalate)
+import Data.String (IsString (fromString))
+import Document qualified as D
+import ProbabilityMap qualified as C
+import System.Directory (copyFile)
+import System.Exit (ExitCode (ExitFailure, ExitSuccess))
+import System.IO.LockFile (withLockFile)
+import System.Process.Typed (readProcessStderr)
+import Text.Printf (printf)
+
+replaceDocument :: String -> D.Document -> IO ()
+replaceDocument message doc = do
+  withGit do
+    J.encodeFile doc.iFilePath doc.index
+    oldDoc <- D.readDocument doc.iFileName
+    _ <- C.modifyProbabilityCache (C.replaceDocument oldDoc doc)
+    commitAll [doc.iFilePath, C.fileName] message
+
+commitDocument :: String -> FilePath -> D.Document -> IO ()
+commitDocument message original doc = do
+  withGit do
+    copyFile original doc.oFilePath
+    J.encodeFile doc.iFilePath doc.index
+    _ <- C.modifyProbabilityCache (C.addDocument doc)
+    commitAll [doc.iFilePath, doc.oFilePath, C.fileName] message
+
+withGit :: IO a -> IO a
+withGit = withLockFile def ".gitlock"
+
+commitAll :: [FilePath] -> String -> IO ()
+commitAll fps m = do
+  sh_ (">/dev/null git add -- " ++ intercalate " " (map (printf "'%s'") fps))
+  sh_ (printf ">/dev/null git commit -m '%s' || :" m)
+
+-- TODO Refacotor library `sh`
+sh_ :: String -> IO ()
+sh_ cmd = do
+  -- printf "+ %s\n" cmd
+  (exitCode, err) <- readProcessStderr (fromString cmd)
+  case exitCode of
+    ExitSuccess -> return ()
+    ExitFailure exitCode' -> throwIO $ ProcessException exitCode' err
+
+data ProcessException = ProcessException Int LB.ByteString
+  deriving (Show)
+
+instance Exception ProcessException
diff --git a/tags/src/Tag.hs b/tags/src/Tag.hs
index f7f3398..882f1de 100644
--- a/tags/src/Tag.hs
+++ b/tags/src/Tag.hs
@@ -51,6 +51,7 @@ module Tag
 where
 
 import Control.Applicative ((<|>))
+import Control.DeepSeq (NFData (rnf))
 import Data.Aeson qualified as J
 import Data.Attoparsec.Text qualified as A
 import Data.Binary (Binary)
@@ -66,6 +67,9 @@ import TypedValue (cast, castDef)
 data Tag = Tag T.Text (Maybe T.Text)
   deriving (Show, Generic, Binary, Eq, Ord)
 
+instance NFData Tag where
+  rnf (Tag k v) = rnf k `seq` rnf v
+
 tag :: T.Text -> Maybe T.Text -> Tag
 tag = Tag
 
diff --git a/tags/tags.cabal b/tags/tags.cabal
index 0149e74..c93aedd 100644
--- a/tags/tags.cabal
+++ b/tags/tags.cabal
@@ -26,7 +26,8 @@ library
       containers,
       regex,
       text,
-      time
+      time,
+      deepseq
     hs-source-dirs: src
     default-language: GHC2021
     default-extensions:
author	Alexander Foremny <aforemny@posteo.de>	2024-01-11 03:20:56 +0100
committer	Alexander Foremny <aforemny@posteo.de>	2024-01-11 03:26:52 +0100
commit	7bdf16be84b368655ce2ee3d9ab6bf185dfb59b5 (patch)
tree	3ee48fc98f98ab7ac7ad19e24334e07b8b147dd6
parent	673c59d9be8b62106ffbba96d805680f0b5e7e3f (diff)