From ddc2c9a43b035ce8eff68aa6e3c7dc2b37511ed5 Mon Sep 17 00:00:00 2001 From: Alexander Foremny Date: Tue, 26 Dec 2023 07:23:16 +0100 Subject: feat: add `topwords` command Changes the heuristic for the probability map. --- app/Main.hs | 135 +++++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 87 insertions(+), 48 deletions(-) (limited to 'app') diff --git a/app/Main.hs b/app/Main.hs index 5eb661d..47eeedd 100644 --- a/app/Main.hs +++ b/app/Main.hs @@ -48,7 +48,7 @@ import System.FilePath import System.IO.LockFile (withLockFile) import System.IO.Temp (withSystemTempDirectory) import System.Process.Typed -import Tag +import Tag qualified as G import Text.Printf (printf) import Text.RE.TDFA.Text qualified as R import Text.Read (readMaybe) @@ -79,6 +79,7 @@ data Cmd | View { indexNames :: [FilePath] } + | TopWords args :: O.Parser Args args = @@ -96,7 +97,9 @@ cmd = O.command "todo" . O.info todoCmd $ O.progDesc "Interactively process new documents", O.command "view" . O.info viewCmd $ - O.progDesc "View document(s)" + O.progDesc "View document(s)", + O.command "topwords" . O.info topWordsCmd $ + O.progDesc "View probability map per tag" ] consumeCmd :: O.Parser Cmd @@ -131,6 +134,10 @@ viewCmd = View <$> indexNamesArg +topWordsCmd :: O.Parser Cmd +topWordsCmd = + pure TopWords + inputsArg :: O.Parser [FilePath] inputsArg = O.many @@ -304,14 +311,30 @@ main = do Args {cmd = View {indexNames}} -> do viewDocuments =<< mapM (readDocument . (<.> "json")) indexNames + Args {cmd = TopWords} -> do + allDocs <- getDocuments + mapM_ + ( \(tag, xs) -> do + print tag + mapM_ (\(word, p) -> printf " %s: %.4f\n" word p) xs + ) + $ map (second (sortBy (comparing (negate . snd)))) + $ map (second (filter ((> 0) . snd))) + $ M.toList + $ foldl + ( \wordProbabilityPerTag (word, tag, p) -> + M.insertWith (++) tag [(word, p)] wordProbabilityPerTag + ) + M.empty + (probabilityMap allDocs) printTags :: Document -> IO () printTags doc = mapM_ ( \tag -> - case tagValue tag of - Nothing -> printf "@%s\n" (tagKey tag) - Just tagValue -> printf "@%s %s\n" (tagKey tag) tagValue + case G.tagValue tag of + Nothing -> printf "@%s\n" (G.tagKey tag) + Just tagValue -> printf "@%s %s\n" (G.tagKey tag) tagValue ) (doc.index.tags `S.union` doc.index.internalTags) @@ -340,11 +363,20 @@ instance HasField "oFilePath" Document FilePath where instance HasField "iFilePath" Document FilePath where getField doc = "index" doc.iFileName +hasWord :: T.Text -> Document -> Bool +hasWord word doc = S.member word doc.index.originalWords + +hasTag :: G.Tag -> Document -> Bool +hasTag tag doc = S.member tag doc.index.tags + +hasWordAndTag :: T.Text -> G.Tag -> Document -> Bool +hasWordAndTag word tag doc = hasTag tag doc && hasWord word doc + tagValues :: [Document] -> M.Map T.Text (S.Set T.Text) tagValues docs = M.unionsWith S.union $ mapMaybe - ( \(Tag tagKey tagValue) -> + ( \(G.Tag tagKey tagValue) -> M.singleton tagKey . S.singleton <$> tagValue ) (S.toList (S.unions (map (.index.tags) docs))) @@ -364,7 +396,7 @@ applyFilters filters = filter (pred filters) `at` (.index.internalTags) where pred1 (Filter Include filter') = pred1' filter' pred1 (Filter Exclude filter') = not . pred1' filter' - pred1' (FilterByTag tagKey) = hasTag (Tag tagKey Nothing) + pred1' (FilterByTag tagKey) = G.hasTag (G.Tag tagKey Nothing) pred filters = \index -> all ($ index) (map pred1 filters) at :: ([a] -> [a]) -> (b -> a) -> [b] -> [b] @@ -445,7 +477,35 @@ editDocuments docs = ) docs -suggestTags :: S.Settings -> [Document] -> Document -> IO [(Tag, [Tag])] +type ProbabilityMap = [(T.Text, G.Tag, Double)] + +-- TODO Consider words that contribute to NOT adding a tag +-- +-- If there is a document that should, say, not have a `@correspondant`, we should score words that contribute to that fact as well. +-- +-- @topic probability-map +probabilityMap :: [Document] -> ProbabilityMap +probabilityMap allDocs = + [ let docsWithWord = filter (hasWord word) allDocs + docsWithTag = filter (hasTag tag) allDocs + docsWithWordAndTag = filter (hasWordAndTag word tag) allDocs + p = + fi (length docsWithWordAndTag) + / fi + ( length docsWithWord + + length docsWithTag + - length docsWithWordAndTag + ) + in (word, tag, p) + | word <- S.toList allWords, + tag <- S.toList allTags + ] + where + allTags = foldl S.union S.empty (map (.index.tags) allDocs) + allWords = foldl S.union S.empty (map (.index.originalWords) allDocs) + fi = fromIntegral @Int @Double + +suggestTags :: S.Settings -> [Document] -> Document -> IO [(G.Tag, [G.Tag])] suggestTags settings allDocs doc = do forM settings.suggestedTags $ \suggestedTag -> do case suggestedTag of @@ -454,41 +514,20 @@ suggestTags settings allDocs doc = do nub . catMaybes . map R.matchedText . R.allMatches $ doc.index.originalText R.*=~ pattern - pure (Tag tagName (Just ""), map (Tag tagName . Just) tagValues) + pure (G.Tag tagName (Just ""), map (G.Tag tagName . Just) tagValues) S.SuggestTagByTags tagName -> do - let allTags = foldl S.union S.empty (map (.index.tags) allDocs) - allWords = foldl S.union S.empty (map (.index.originalWords) allDocs) - - hasWord word doc = S.member word doc.index.originalWords - hasTag tag doc = S.member tag doc.index.tags - hasWordAndTag word tag doc = hasTag tag doc && hasWord word doc - fi = fromIntegral @Int @Double - - -- TODO Consider words that contribute to NOT adding a tag - -- - -- If there is a document that should, say, not have a `@correspondant`, we should score words that contribute to that fact as well. - -- - -- @topic probability-map - - -- TODO Cache `probabilityMap` - probabilityMap = - [ let docs = filter (hasWordAndTag word tag) allDocs - p = fi (length docs) / fi (length allDocs) - in (word, tag, p) - | word <- S.toList allWords, - tag <- S.toList allTags - ] let tagValues = - probabilityMap + -- TODO Cache `probabilityMap` + probabilityMap allDocs & filter ( \(word, tag, _) -> - tagKey tag == tagName && hasWord word doc + G.tagKey tag == tagName && hasWord word doc ) & foldl' ( \scorePerTagValue (_, tag, p) -> M.insertWith (+) - (tagValue tag) + (G.tagValue tag) p scorePerTagValue ) @@ -496,9 +535,9 @@ suggestTags settings allDocs doc = do & M.toList & sortBy (comparing (negate . snd)) & map fst - pure (Tag tagName (Just ""), map (Tag tagName) tagValues) + pure (G.Tag tagName (Just ""), map (G.Tag tagName) tagValues) -autoApplySuggestedTags :: [(Tag, [Tag])] -> [Tag] +autoApplySuggestedTags :: [(G.Tag, [G.Tag])] -> [G.Tag] autoApplySuggestedTags = mapMaybe ( \(_, suggestedTags) -> @@ -540,7 +579,7 @@ tagDocumentInteractively settings allDocs doc = do doc.index { tags = S.filter - (not . (`elem` tagsToRemove) . tagKey) + (not . (`elem` tagsToRemove) . G.tagKey) doc.index.tags `S.union` (S.fromList tagsToAdd), todo = False @@ -551,22 +590,22 @@ tagDocumentInteractively settings allDocs doc = do commitAll [doc.iFilePath] (printf "process %s (interactive)" doc.iFilePath) pure doc' where - tagDocumentInteractively' :: Tag -> [Tag] -> IO (Either T.Text Tag) - tagDocumentInteractively' tag@(Tag tagKey Nothing) _ = do + tagDocumentInteractively' :: G.Tag -> [G.Tag] -> IO (Either T.Text G.Tag) + tagDocumentInteractively' tag@(G.Tag tagKey Nothing) _ = do choice <- P.prompt $ P.choice (printf "tag with %s?" tagKey) (("n" :: String) N.:| ["y"]) pure $ if (choice == "y") then Right tag else Left tagKey - tagDocumentInteractively' (Tag tagKey (Just _)) tags = do + tagDocumentInteractively' (G.Tag tagKey (Just _)) tags = do tagValue <- fmap T.pack . P.prompt $ P.string (printf "tag with %s?" tagKey) - (mapMaybe (fmap T.unpack . tagValue) tags ++ ["-"]) + (mapMaybe (fmap T.unpack . G.tagValue) tags ++ ["-"]) pure $ if tagValue == "-" then Left tagKey - else Right (Tag tagKey (Just tagValue)) + else Right (G.Tag tagKey (Just tagValue)) ensureGit :: IO () ensureGit = do @@ -666,7 +705,7 @@ ocr1 language tmp input = data Index = Index { originalText :: T.Text, - tags :: S.Set Tag, + tags :: S.Set G.Tag, addedAt :: UTCTime, todo :: Bool, language :: String @@ -682,7 +721,7 @@ instance HasField "shortText" Index T.Text where (T.unlines . take 10 . T.lines) . (.originalText) -instance HasField "internalTags" Index (S.Set Tag) where +instance HasField "internalTags" Index (S.Set G.Tag) where getField index = index.tags `S.union` internalTags index @@ -694,14 +733,14 @@ instance HasField "originalWords" Index (S.Set T.Text) where getField index = S.fromList (T.words index.originalText) -internalTags :: Index -> S.Set Tag +internalTags :: Index -> S.Set G.Tag internalTags index = S.fromList ( concat - [ [ Tag "addedAt" (Just (T.pack (iso8601Show index.addedAt))), - Tag "language" (Just (T.pack index.language)) + [ [ G.Tag "addedAt" (Just (T.pack (iso8601Show index.addedAt))), + G.Tag "language" (Just (T.pack index.language)) ], - if index.todo then [Tag "todo" Nothing] else [] + if index.todo then [G.Tag "todo" Nothing] else [] ] ) -- cgit v1.2.3