From ddc2c9a43b035ce8eff68aa6e3c7dc2b37511ed5 Mon Sep 17 00:00:00 2001
From: Alexander Foremny <aforemny@posteo.de>
Date: Tue, 26 Dec 2023 07:23:16 +0100
Subject: feat: add `topwords` command

Changes the heuristic for the probability map.
---
 app/Main.hs | 135 +++++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 87 insertions(+), 48 deletions(-)

diff --git a/app/Main.hs b/app/Main.hs
index 5eb661d..47eeedd 100644
--- a/app/Main.hs
+++ b/app/Main.hs
@@ -48,7 +48,7 @@ import System.FilePath
 import System.IO.LockFile (withLockFile)
 import System.IO.Temp (withSystemTempDirectory)
 import System.Process.Typed
-import Tag
+import Tag qualified as G
 import Text.Printf (printf)
 import Text.RE.TDFA.Text qualified as R
 import Text.Read (readMaybe)
@@ -79,6 +79,7 @@ data Cmd
   | View
       { indexNames :: [FilePath]
       }
+  | TopWords
 
 args :: O.Parser Args
 args =
@@ -96,7 +97,9 @@ cmd =
       O.command "todo" . O.info todoCmd $
         O.progDesc "Interactively process new documents",
       O.command "view" . O.info viewCmd $
-        O.progDesc "View document(s)"
+        O.progDesc "View document(s)",
+      O.command "topwords" . O.info topWordsCmd $
+        O.progDesc "View probability map per tag"
     ]
 
 consumeCmd :: O.Parser Cmd
@@ -131,6 +134,10 @@ viewCmd =
   View
     <$> indexNamesArg
 
+topWordsCmd :: O.Parser Cmd
+topWordsCmd =
+  pure TopWords
+
 inputsArg :: O.Parser [FilePath]
 inputsArg =
   O.many
@@ -304,14 +311,30 @@ main = do
     Args {cmd = View {indexNames}} -> do
       viewDocuments
         =<< mapM (readDocument . (<.> "json")) indexNames
+    Args {cmd = TopWords} -> do
+      allDocs <- getDocuments
+      mapM_
+        ( \(tag, xs) -> do
+            print tag
+            mapM_ (\(word, p) -> printf "  %s: %.4f\n" word p) xs
+        )
+        $ map (second (sortBy (comparing (negate . snd))))
+        $ map (second (filter ((> 0) . snd)))
+        $ M.toList
+        $ foldl
+          ( \wordProbabilityPerTag (word, tag, p) ->
+              M.insertWith (++) tag [(word, p)] wordProbabilityPerTag
+          )
+          M.empty
+          (probabilityMap allDocs)
 
 printTags :: Document -> IO ()
 printTags doc =
   mapM_
     ( \tag ->
-        case tagValue tag of
-          Nothing -> printf "@%s\n" (tagKey tag)
-          Just tagValue -> printf "@%s %s\n" (tagKey tag) tagValue
+        case G.tagValue tag of
+          Nothing -> printf "@%s\n" (G.tagKey tag)
+          Just tagValue -> printf "@%s %s\n" (G.tagKey tag) tagValue
     )
     (doc.index.tags `S.union` doc.index.internalTags)
 
@@ -340,11 +363,20 @@ instance HasField "oFilePath" Document FilePath where
 instance HasField "iFilePath" Document FilePath where
   getField doc = "index" </> doc.iFileName
 
+hasWord :: T.Text -> Document -> Bool
+hasWord word doc = S.member word doc.index.originalWords
+
+hasTag :: G.Tag -> Document -> Bool
+hasTag tag doc = S.member tag doc.index.tags
+
+hasWordAndTag :: T.Text -> G.Tag -> Document -> Bool
+hasWordAndTag word tag doc = hasTag tag doc && hasWord word doc
+
 tagValues :: [Document] -> M.Map T.Text (S.Set T.Text)
 tagValues docs =
   M.unionsWith S.union $
     mapMaybe
-      ( \(Tag tagKey tagValue) ->
+      ( \(G.Tag tagKey tagValue) ->
           M.singleton tagKey . S.singleton <$> tagValue
       )
       (S.toList (S.unions (map (.index.tags) docs)))
@@ -364,7 +396,7 @@ applyFilters filters = filter (pred filters) `at` (.index.internalTags)
   where
     pred1 (Filter Include filter') = pred1' filter'
     pred1 (Filter Exclude filter') = not . pred1' filter'
-    pred1' (FilterByTag tagKey) = hasTag (Tag tagKey Nothing)
+    pred1' (FilterByTag tagKey) = G.hasTag (G.Tag tagKey Nothing)
     pred filters = \index -> all ($ index) (map pred1 filters)
 
     at :: ([a] -> [a]) -> (b -> a) -> [b] -> [b]
@@ -445,7 +477,35 @@ editDocuments docs =
       )
       docs
 
-suggestTags :: S.Settings -> [Document] -> Document -> IO [(Tag, [Tag])]
+type ProbabilityMap = [(T.Text, G.Tag, Double)]
+
+-- TODO Consider words that contribute to NOT adding a tag
+--
+-- If there is a document that should, say, not have a `@correspondant`, we should score words that contribute to that fact as well.
+--
+-- @topic probability-map
+probabilityMap :: [Document] -> ProbabilityMap
+probabilityMap allDocs =
+  [ let docsWithWord = filter (hasWord word) allDocs
+        docsWithTag = filter (hasTag tag) allDocs
+        docsWithWordAndTag = filter (hasWordAndTag word tag) allDocs
+        p =
+          fi (length docsWithWordAndTag)
+            / fi
+              ( length docsWithWord
+                  + length docsWithTag
+                  - length docsWithWordAndTag
+              )
+     in (word, tag, p)
+    | word <- S.toList allWords,
+      tag <- S.toList allTags
+  ]
+  where
+    allTags = foldl S.union S.empty (map (.index.tags) allDocs)
+    allWords = foldl S.union S.empty (map (.index.originalWords) allDocs)
+    fi = fromIntegral @Int @Double
+
+suggestTags :: S.Settings -> [Document] -> Document -> IO [(G.Tag, [G.Tag])]
 suggestTags settings allDocs doc = do
   forM settings.suggestedTags $ \suggestedTag -> do
     case suggestedTag of
@@ -454,41 +514,20 @@ suggestTags settings allDocs doc = do
               nub . catMaybes . map R.matchedText . R.allMatches $
                 doc.index.originalText
                   R.*=~ pattern
-        pure (Tag tagName (Just ""), map (Tag tagName . Just) tagValues)
+        pure (G.Tag tagName (Just ""), map (G.Tag tagName . Just) tagValues)
       S.SuggestTagByTags tagName -> do
-        let allTags = foldl S.union S.empty (map (.index.tags) allDocs)
-            allWords = foldl S.union S.empty (map (.index.originalWords) allDocs)
-
-            hasWord word doc = S.member word doc.index.originalWords
-            hasTag tag doc = S.member tag doc.index.tags
-            hasWordAndTag word tag doc = hasTag tag doc && hasWord word doc
-            fi = fromIntegral @Int @Double
-
-            -- TODO Consider words that contribute to NOT adding a tag
-            --
-            -- If there is a document that should, say, not have a `@correspondant`, we should score words that contribute to that fact as well.
-            --
-            -- @topic probability-map
-
-            -- TODO Cache `probabilityMap`
-            probabilityMap =
-              [ let docs = filter (hasWordAndTag word tag) allDocs
-                    p = fi (length docs) / fi (length allDocs)
-                 in (word, tag, p)
-                | word <- S.toList allWords,
-                  tag <- S.toList allTags
-              ]
         let tagValues =
-              probabilityMap
+              -- TODO Cache `probabilityMap`
+              probabilityMap allDocs
                 & filter
                   ( \(word, tag, _) ->
-                      tagKey tag == tagName && hasWord word doc
+                      G.tagKey tag == tagName && hasWord word doc
                   )
                 & foldl'
                   ( \scorePerTagValue (_, tag, p) ->
                       M.insertWith
                         (+)
-                        (tagValue tag)
+                        (G.tagValue tag)
                         p
                         scorePerTagValue
                   )
@@ -496,9 +535,9 @@ suggestTags settings allDocs doc = do
                 & M.toList
                 & sortBy (comparing (negate . snd))
                 & map fst
-        pure (Tag tagName (Just ""), map (Tag tagName) tagValues)
+        pure (G.Tag tagName (Just ""), map (G.Tag tagName) tagValues)
 
-autoApplySuggestedTags :: [(Tag, [Tag])] -> [Tag]
+autoApplySuggestedTags :: [(G.Tag, [G.Tag])] -> [G.Tag]
 autoApplySuggestedTags =
   mapMaybe
     ( \(_, suggestedTags) ->
@@ -540,7 +579,7 @@ tagDocumentInteractively settings allDocs doc = do
               doc.index
                 { tags =
                     S.filter
-                      (not . (`elem` tagsToRemove) . tagKey)
+                      (not . (`elem` tagsToRemove) . G.tagKey)
                       doc.index.tags
                       `S.union` (S.fromList tagsToAdd),
                   todo = False
@@ -551,22 +590,22 @@ tagDocumentInteractively settings allDocs doc = do
     commitAll [doc.iFilePath] (printf "process %s (interactive)" doc.iFilePath)
     pure doc'
   where
-    tagDocumentInteractively' :: Tag -> [Tag] -> IO (Either T.Text Tag)
-    tagDocumentInteractively' tag@(Tag tagKey Nothing) _ = do
+    tagDocumentInteractively' :: G.Tag -> [G.Tag] -> IO (Either T.Text G.Tag)
+    tagDocumentInteractively' tag@(G.Tag tagKey Nothing) _ = do
       choice <-
         P.prompt $
           P.choice (printf "tag with %s?" tagKey) (("n" :: String) N.:| ["y"])
       pure $ if (choice == "y") then Right tag else Left tagKey
-    tagDocumentInteractively' (Tag tagKey (Just _)) tags = do
+    tagDocumentInteractively' (G.Tag tagKey (Just _)) tags = do
       tagValue <-
         fmap T.pack . P.prompt $
           P.string
             (printf "tag with %s?" tagKey)
-            (mapMaybe (fmap T.unpack . tagValue) tags ++ ["-"])
+            (mapMaybe (fmap T.unpack . G.tagValue) tags ++ ["-"])
       pure $
         if tagValue == "-"
           then Left tagKey
-          else Right (Tag tagKey (Just tagValue))
+          else Right (G.Tag tagKey (Just tagValue))
 
 ensureGit :: IO ()
 ensureGit = do
@@ -666,7 +705,7 @@ ocr1 language tmp input =
 
 data Index = Index
   { originalText :: T.Text,
-    tags :: S.Set Tag,
+    tags :: S.Set G.Tag,
     addedAt :: UTCTime,
     todo :: Bool,
     language :: String
@@ -682,7 +721,7 @@ instance HasField "shortText" Index T.Text where
     (T.unlines . take 10 . T.lines)
       . (.originalText)
 
-instance HasField "internalTags" Index (S.Set Tag) where
+instance HasField "internalTags" Index (S.Set G.Tag) where
   getField index =
     index.tags
       `S.union` internalTags index
@@ -694,14 +733,14 @@ instance HasField "originalWords" Index (S.Set T.Text) where
   getField index =
     S.fromList (T.words index.originalText)
 
-internalTags :: Index -> S.Set Tag
+internalTags :: Index -> S.Set G.Tag
 internalTags index =
   S.fromList
     ( concat
-        [ [ Tag "addedAt" (Just (T.pack (iso8601Show index.addedAt))),
-            Tag "language" (Just (T.pack index.language))
+        [ [ G.Tag "addedAt" (Just (T.pack (iso8601Show index.addedAt))),
+            G.Tag "language" (Just (T.pack index.language))
           ],
-          if index.todo then [Tag "todo" Nothing] else []
+          if index.todo then [G.Tag "todo" Nothing] else []
         ]
     )
 
-- 
cgit v1.2.3