summaryrefslogtreecommitdiffstats
path: root/app
diff options
context:
space:
mode:
authorLibravatar Alexander Foremny <aforemny@posteo.de>2023-12-26 07:23:16 +0100
committerLibravatar Alexander Foremny <aforemny@posteo.de>2023-12-26 07:23:17 +0100
commitddc2c9a43b035ce8eff68aa6e3c7dc2b37511ed5 (patch)
tree60bff5875385a9809163c4c22f0685de455d736c /app
parent1a96d382c8de3af9eb28a065d0407ad11151b911 (diff)
feat: add `topwords` command
Changes the heuristic for the probability map.
Diffstat (limited to 'app')
-rw-r--r--app/Main.hs135
1 files changed, 87 insertions, 48 deletions
diff --git a/app/Main.hs b/app/Main.hs
index 5eb661d..47eeedd 100644
--- a/app/Main.hs
+++ b/app/Main.hs
@@ -48,7 +48,7 @@ import System.FilePath
import System.IO.LockFile (withLockFile)
import System.IO.Temp (withSystemTempDirectory)
import System.Process.Typed
-import Tag
+import Tag qualified as G
import Text.Printf (printf)
import Text.RE.TDFA.Text qualified as R
import Text.Read (readMaybe)
@@ -79,6 +79,7 @@ data Cmd
| View
{ indexNames :: [FilePath]
}
+ | TopWords
args :: O.Parser Args
args =
@@ -96,7 +97,9 @@ cmd =
O.command "todo" . O.info todoCmd $
O.progDesc "Interactively process new documents",
O.command "view" . O.info viewCmd $
- O.progDesc "View document(s)"
+ O.progDesc "View document(s)",
+ O.command "topwords" . O.info topWordsCmd $
+ O.progDesc "View probability map per tag"
]
consumeCmd :: O.Parser Cmd
@@ -131,6 +134,10 @@ viewCmd =
View
<$> indexNamesArg
+topWordsCmd :: O.Parser Cmd
+topWordsCmd =
+ pure TopWords
+
inputsArg :: O.Parser [FilePath]
inputsArg =
O.many
@@ -304,14 +311,30 @@ main = do
Args {cmd = View {indexNames}} -> do
viewDocuments
=<< mapM (readDocument . (<.> "json")) indexNames
+ Args {cmd = TopWords} -> do
+ allDocs <- getDocuments
+ mapM_
+ ( \(tag, xs) -> do
+ print tag
+ mapM_ (\(word, p) -> printf " %s: %.4f\n" word p) xs
+ )
+ $ map (second (sortBy (comparing (negate . snd))))
+ $ map (second (filter ((> 0) . snd)))
+ $ M.toList
+ $ foldl
+ ( \wordProbabilityPerTag (word, tag, p) ->
+ M.insertWith (++) tag [(word, p)] wordProbabilityPerTag
+ )
+ M.empty
+ (probabilityMap allDocs)
printTags :: Document -> IO ()
printTags doc =
mapM_
( \tag ->
- case tagValue tag of
- Nothing -> printf "@%s\n" (tagKey tag)
- Just tagValue -> printf "@%s %s\n" (tagKey tag) tagValue
+ case G.tagValue tag of
+ Nothing -> printf "@%s\n" (G.tagKey tag)
+ Just tagValue -> printf "@%s %s\n" (G.tagKey tag) tagValue
)
(doc.index.tags `S.union` doc.index.internalTags)
@@ -340,11 +363,20 @@ instance HasField "oFilePath" Document FilePath where
instance HasField "iFilePath" Document FilePath where
getField doc = "index" </> doc.iFileName
+hasWord :: T.Text -> Document -> Bool
+hasWord word doc = S.member word doc.index.originalWords
+
+hasTag :: G.Tag -> Document -> Bool
+hasTag tag doc = S.member tag doc.index.tags
+
+hasWordAndTag :: T.Text -> G.Tag -> Document -> Bool
+hasWordAndTag word tag doc = hasTag tag doc && hasWord word doc
+
tagValues :: [Document] -> M.Map T.Text (S.Set T.Text)
tagValues docs =
M.unionsWith S.union $
mapMaybe
- ( \(Tag tagKey tagValue) ->
+ ( \(G.Tag tagKey tagValue) ->
M.singleton tagKey . S.singleton <$> tagValue
)
(S.toList (S.unions (map (.index.tags) docs)))
@@ -364,7 +396,7 @@ applyFilters filters = filter (pred filters) `at` (.index.internalTags)
where
pred1 (Filter Include filter') = pred1' filter'
pred1 (Filter Exclude filter') = not . pred1' filter'
- pred1' (FilterByTag tagKey) = hasTag (Tag tagKey Nothing)
+ pred1' (FilterByTag tagKey) = G.hasTag (G.Tag tagKey Nothing)
pred filters = \index -> all ($ index) (map pred1 filters)
at :: ([a] -> [a]) -> (b -> a) -> [b] -> [b]
@@ -445,7 +477,35 @@ editDocuments docs =
)
docs
-suggestTags :: S.Settings -> [Document] -> Document -> IO [(Tag, [Tag])]
+type ProbabilityMap = [(T.Text, G.Tag, Double)]
+
+-- TODO Consider words that contribute to NOT adding a tag
+--
+-- If there is a document that should, say, not have a `@correspondant`, we should score words that contribute to that fact as well.
+--
+-- @topic probability-map
+probabilityMap :: [Document] -> ProbabilityMap
+probabilityMap allDocs =
+ [ let docsWithWord = filter (hasWord word) allDocs
+ docsWithTag = filter (hasTag tag) allDocs
+ docsWithWordAndTag = filter (hasWordAndTag word tag) allDocs
+ p =
+ fi (length docsWithWordAndTag)
+ / fi
+ ( length docsWithWord
+ + length docsWithTag
+ - length docsWithWordAndTag
+ )
+ in (word, tag, p)
+ | word <- S.toList allWords,
+ tag <- S.toList allTags
+ ]
+ where
+ allTags = foldl S.union S.empty (map (.index.tags) allDocs)
+ allWords = foldl S.union S.empty (map (.index.originalWords) allDocs)
+ fi = fromIntegral @Int @Double
+
+suggestTags :: S.Settings -> [Document] -> Document -> IO [(G.Tag, [G.Tag])]
suggestTags settings allDocs doc = do
forM settings.suggestedTags $ \suggestedTag -> do
case suggestedTag of
@@ -454,41 +514,20 @@ suggestTags settings allDocs doc = do
nub . catMaybes . map R.matchedText . R.allMatches $
doc.index.originalText
R.*=~ pattern
- pure (Tag tagName (Just ""), map (Tag tagName . Just) tagValues)
+ pure (G.Tag tagName (Just ""), map (G.Tag tagName . Just) tagValues)
S.SuggestTagByTags tagName -> do
- let allTags = foldl S.union S.empty (map (.index.tags) allDocs)
- allWords = foldl S.union S.empty (map (.index.originalWords) allDocs)
-
- hasWord word doc = S.member word doc.index.originalWords
- hasTag tag doc = S.member tag doc.index.tags
- hasWordAndTag word tag doc = hasTag tag doc && hasWord word doc
- fi = fromIntegral @Int @Double
-
- -- TODO Consider words that contribute to NOT adding a tag
- --
- -- If there is a document that should, say, not have a `@correspondant`, we should score words that contribute to that fact as well.
- --
- -- @topic probability-map
-
- -- TODO Cache `probabilityMap`
- probabilityMap =
- [ let docs = filter (hasWordAndTag word tag) allDocs
- p = fi (length docs) / fi (length allDocs)
- in (word, tag, p)
- | word <- S.toList allWords,
- tag <- S.toList allTags
- ]
let tagValues =
- probabilityMap
+ -- TODO Cache `probabilityMap`
+ probabilityMap allDocs
& filter
( \(word, tag, _) ->
- tagKey tag == tagName && hasWord word doc
+ G.tagKey tag == tagName && hasWord word doc
)
& foldl'
( \scorePerTagValue (_, tag, p) ->
M.insertWith
(+)
- (tagValue tag)
+ (G.tagValue tag)
p
scorePerTagValue
)
@@ -496,9 +535,9 @@ suggestTags settings allDocs doc = do
& M.toList
& sortBy (comparing (negate . snd))
& map fst
- pure (Tag tagName (Just ""), map (Tag tagName) tagValues)
+ pure (G.Tag tagName (Just ""), map (G.Tag tagName) tagValues)
-autoApplySuggestedTags :: [(Tag, [Tag])] -> [Tag]
+autoApplySuggestedTags :: [(G.Tag, [G.Tag])] -> [G.Tag]
autoApplySuggestedTags =
mapMaybe
( \(_, suggestedTags) ->
@@ -540,7 +579,7 @@ tagDocumentInteractively settings allDocs doc = do
doc.index
{ tags =
S.filter
- (not . (`elem` tagsToRemove) . tagKey)
+ (not . (`elem` tagsToRemove) . G.tagKey)
doc.index.tags
`S.union` (S.fromList tagsToAdd),
todo = False
@@ -551,22 +590,22 @@ tagDocumentInteractively settings allDocs doc = do
commitAll [doc.iFilePath] (printf "process %s (interactive)" doc.iFilePath)
pure doc'
where
- tagDocumentInteractively' :: Tag -> [Tag] -> IO (Either T.Text Tag)
- tagDocumentInteractively' tag@(Tag tagKey Nothing) _ = do
+ tagDocumentInteractively' :: G.Tag -> [G.Tag] -> IO (Either T.Text G.Tag)
+ tagDocumentInteractively' tag@(G.Tag tagKey Nothing) _ = do
choice <-
P.prompt $
P.choice (printf "tag with %s?" tagKey) (("n" :: String) N.:| ["y"])
pure $ if (choice == "y") then Right tag else Left tagKey
- tagDocumentInteractively' (Tag tagKey (Just _)) tags = do
+ tagDocumentInteractively' (G.Tag tagKey (Just _)) tags = do
tagValue <-
fmap T.pack . P.prompt $
P.string
(printf "tag with %s?" tagKey)
- (mapMaybe (fmap T.unpack . tagValue) tags ++ ["-"])
+ (mapMaybe (fmap T.unpack . G.tagValue) tags ++ ["-"])
pure $
if tagValue == "-"
then Left tagKey
- else Right (Tag tagKey (Just tagValue))
+ else Right (G.Tag tagKey (Just tagValue))
ensureGit :: IO ()
ensureGit = do
@@ -666,7 +705,7 @@ ocr1 language tmp input =
data Index = Index
{ originalText :: T.Text,
- tags :: S.Set Tag,
+ tags :: S.Set G.Tag,
addedAt :: UTCTime,
todo :: Bool,
language :: String
@@ -682,7 +721,7 @@ instance HasField "shortText" Index T.Text where
(T.unlines . take 10 . T.lines)
. (.originalText)
-instance HasField "internalTags" Index (S.Set Tag) where
+instance HasField "internalTags" Index (S.Set G.Tag) where
getField index =
index.tags
`S.union` internalTags index
@@ -694,14 +733,14 @@ instance HasField "originalWords" Index (S.Set T.Text) where
getField index =
S.fromList (T.words index.originalText)
-internalTags :: Index -> S.Set Tag
+internalTags :: Index -> S.Set G.Tag
internalTags index =
S.fromList
( concat
- [ [ Tag "addedAt" (Just (T.pack (iso8601Show index.addedAt))),
- Tag "language" (Just (T.pack index.language))
+ [ [ G.Tag "addedAt" (Just (T.pack (iso8601Show index.addedAt))),
+ G.Tag "language" (Just (T.pack index.language))
],
- if index.todo then [Tag "todo" Nothing] else []
+ if index.todo then [G.Tag "todo" Nothing] else []
]
)