From 4089b57dda84ce907046c7d47c44f75711310e23 Mon Sep 17 00:00:00 2001
From: Alexander Foremny <aforemny@posteo.de>
Date: Tue, 26 Dec 2023 05:39:29 +0100
Subject: chore: add `SuggestTagByTags`

This naive implementation scores tags based on the words that correlate
to a tag being set. It then uses that score to determine the
highest-scoring value for a tag based on the words in a document.
---
 app/Main.hs     | 70 +++++++++++++++++++++++++++++++++++++++++++++++++--------
 app/Settings.hs |  4 +++-
 2 files changed, 64 insertions(+), 10 deletions(-)

(limited to 'app')

diff --git a/app/Main.hs b/app/Main.hs
index adb16b4..096f253 100644
--- a/app/Main.hs
+++ b/app/Main.hs
@@ -1,5 +1,6 @@
 {-# LANGUAGE BlockArguments #-}
 {-# LANGUAGE DataKinds #-}
+{-# LANGUAGE DuplicateRecordFields #-}
 {-# LANGUAGE LambdaCase #-}
 {-# LANGUAGE OverloadedRecordDot #-}
 {-# LANGUAGE OverloadedStrings #-}
@@ -25,6 +26,7 @@ import Data.List
 import Data.List.NonEmpty qualified as N
 import Data.Map qualified as M
 import Data.Maybe (catMaybes, fromMaybe, mapMaybe)
+import Data.Ord (comparing)
 import Data.Set qualified as S
 import Data.String (IsString (fromString))
 import Data.Text qualified as T
@@ -291,7 +293,7 @@ printTags doc =
           Nothing -> printf "@%s\n" (tagKey tag)
           Just tagValue -> printf "@%s %s\n" (tagKey tag) tagValue
     )
-    doc.index.tags
+    (doc.index.tags `S.union` doc.index.internalTags)
 
 doRedoIf :: [Filter] -> Bool -> IO ()
 doRedoIf filters redo =
@@ -434,6 +436,45 @@ suggestTags settings allDocs doc = do
                 doc.index.originalText
                   R.*=~ pattern
         pure (Tag tagName (Just ""), map (Tag tagName . Just) tagValues)
+      S.SuggestTagByTags tagName -> do
+        let allTags = foldl S.union S.empty (map (.index.tags) allDocs)
+            allWords = foldl S.union S.empty (map (.index.originalWords) allDocs)
+
+            hasWord word doc = S.member word doc.index.originalWords
+            hasTag tag doc = S.member tag doc.index.tags
+            hasWordAndTag word tag doc = hasTag tag doc && hasWord word doc
+            fi = fromIntegral
+
+            -- TODO Consider words that contribute to NOT adding a tag
+            --
+            -- If there is a document that should, say, not have a `@correspondant`, we should score words that contribute to that fact as well.
+            --
+            -- @topic probability-map
+
+            -- TODO Cache `probabilityMap`
+            probabilityMap =
+              [ let docs = filter (hasWordAndTag word tag) allDocs
+                    p = fi (length docs) / fi (length allDocs)
+                 in (word, tag, p)
+                | word <- S.toList allWords,
+                  tag <- S.toList allTags
+              ]
+        let tagValues =
+              probabilityMap
+                & filter (\(word, tag, _) -> hasWordAndTag word tag doc)
+                & foldl'
+                  ( \scorePerTagValue (_, tag, p) ->
+                      M.insertWith
+                        (+)
+                        (tagValue tag)
+                        p
+                        scorePerTagValue
+                  )
+                  M.empty
+                & M.toList
+                & sortBy (comparing (negate . snd))
+                & map fst
+        pure (Tag tagName (Just ""), map (Tag tagName) tagValues)
 
 autoApplySuggestedTags :: [(Tag, [Tag])] -> [Tag]
 autoApplySuggestedTags =
@@ -456,8 +497,8 @@ processDocuments settings allDocs docs =
             doc
               { index =
                   doc.index
-                    { tags =
-                        S.delete (Tag "todo" Nothing) (doc.index.tags `S.union` tags)
+                    { tags = doc.index.tags `S.union` tags,
+                      todo = False
                     }
               }
       withGit do
@@ -475,8 +516,8 @@ tagDocumentInteractively settings allDocs doc = do
         doc
           { index =
               doc.index
-                { tags =
-                    S.delete (Tag "todo" Nothing) (doc.index.tags `S.union` tags)
+                { tags = doc.index.tags `S.union` tags,
+                  todo = False
                 }
           }
   withGit do
@@ -536,7 +577,7 @@ consume1 force keep filePath = do
       else pure originalText'
   addedAt <- getCurrentTime
   withGit do
-    J.encodeFile iFilePath Index {tags = S.singleton (Tag "todo" Nothing), ..}
+    J.encodeFile iFilePath Index {tags = S.empty, todo = True, ..}
     if keep
       then copyFile filePath oFilePath
       else renameFile filePath oFilePath
@@ -600,7 +641,8 @@ ocr1 tmp input =
 data Index = Index
   { originalText :: T.Text,
     tags :: S.Set Tag,
-    addedAt :: UTCTime
+    addedAt :: UTCTime,
+    todo :: Bool
   }
   deriving (Show, Generic, Eq)
 
@@ -618,11 +660,21 @@ instance HasField "internalTags" Index (S.Set Tag) where
     index.tags
       `S.union` internalTags index
 
+-- TODO Cache `originalWords`
+--
+-- @related cache-probabilitymap
+instance HasField "originalWords" Index (S.Set T.Text) where
+  getField index =
+    S.fromList (T.words index.originalText)
+
 internalTags :: Index -> S.Set Tag
 internalTags index =
   S.fromList
-    [ Tag "addedAt" (Just (T.pack (iso8601Show index.addedAt)))
-    ]
+    ( concat
+        [ [Tag "addedAt" (Just (T.pack (iso8601Show index.addedAt)))],
+          if index.todo then [Tag "todo" Nothing] else []
+        ]
+    )
 
 data PdfInfo = PdfInfo
   { numPages :: Int,
diff --git a/app/Settings.hs b/app/Settings.hs
index 5d4d55f..39e8e35 100644
--- a/app/Settings.hs
+++ b/app/Settings.hs
@@ -43,7 +43,8 @@ instance Monoid Settings where
             --
             -- -   date formats that spell out the month name, ie. `1 Januar 1970` or `1 Jan 1970`
             -- -   perform minor corrections on OCR, ie. parse `0 1.01.1970`
-            SuggestTagByRE "createdAt" [R.re|[0-9]{2}\.[0-9]{2}\.[0-9]{4}|]
+            SuggestTagByRE "createdAt" [R.re|[0-9]{2}\.[0-9]{2}\.[0-9]{4}|],
+            SuggestTagByTags "correspondent"
           ]
       }
 
@@ -53,6 +54,7 @@ instance A.ToJSON Settings
 
 data SuggestedTag
   = SuggestTagByRE T.Text R.RE
+  | SuggestTagByTags T.Text
   deriving (Show, Generic, Eq)
 
 instance Show R.RE where
-- 
cgit v1.2.3