summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLibravatar Alexander Foremny <aforemny@posteo.de>2023-12-26 05:39:29 +0100
committerLibravatar Alexander Foremny <aforemny@posteo.de>2023-12-26 05:39:30 +0100
commit4089b57dda84ce907046c7d47c44f75711310e23 (patch)
treef00edf04967cdc1dc188f4f35dcc54ca0a4c544c
parent97d2eb5a52d546db79ac687479f2d687de8445c1 (diff)
chore: add `SuggestTagByTags`
This naive implementation scores tags based on the words that correlate to a tag being set. It then uses that score to determine the highest-scoring value for a tag based on the words in a document.
-rw-r--r--apaperless.yaml6
-rw-r--r--app/Main.hs70
-rw-r--r--app/Settings.hs4
3 files changed, 69 insertions, 11 deletions
diff --git a/apaperless.yaml b/apaperless.yaml
index de0da43..f6e63a7 100644
--- a/apaperless.yaml
+++ b/apaperless.yaml
@@ -1,3 +1,7 @@
suggestedTags:
-- - createdAt
+- contents:
+ - createdAt
- '[0-9]{2}\.[0-9]{2}\.[0-9]{4}'
+ tag: SuggestTagByRE
+- contents: correspondent
+ tag: SuggestTagByTags
diff --git a/app/Main.hs b/app/Main.hs
index adb16b4..096f253 100644
--- a/app/Main.hs
+++ b/app/Main.hs
@@ -1,5 +1,6 @@
{-# LANGUAGE BlockArguments #-}
{-# LANGUAGE DataKinds #-}
+{-# LANGUAGE DuplicateRecordFields #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedRecordDot #-}
{-# LANGUAGE OverloadedStrings #-}
@@ -25,6 +26,7 @@ import Data.List
import Data.List.NonEmpty qualified as N
import Data.Map qualified as M
import Data.Maybe (catMaybes, fromMaybe, mapMaybe)
+import Data.Ord (comparing)
import Data.Set qualified as S
import Data.String (IsString (fromString))
import Data.Text qualified as T
@@ -291,7 +293,7 @@ printTags doc =
Nothing -> printf "@%s\n" (tagKey tag)
Just tagValue -> printf "@%s %s\n" (tagKey tag) tagValue
)
- doc.index.tags
+ (doc.index.tags `S.union` doc.index.internalTags)
doRedoIf :: [Filter] -> Bool -> IO ()
doRedoIf filters redo =
@@ -434,6 +436,45 @@ suggestTags settings allDocs doc = do
doc.index.originalText
R.*=~ pattern
pure (Tag tagName (Just ""), map (Tag tagName . Just) tagValues)
+ S.SuggestTagByTags tagName -> do
+ let allTags = foldl S.union S.empty (map (.index.tags) allDocs)
+ allWords = foldl S.union S.empty (map (.index.originalWords) allDocs)
+
+ hasWord word doc = S.member word doc.index.originalWords
+ hasTag tag doc = S.member tag doc.index.tags
+ hasWordAndTag word tag doc = hasTag tag doc && hasWord word doc
+ fi = fromIntegral
+
+ -- TODO Consider words that contribute to NOT adding a tag
+ --
+ -- If there is a document that should, say, not have a `@correspondant`, we should score words that contribute to that fact as well.
+ --
+ -- @topic probability-map
+
+ -- TODO Cache `probabilityMap`
+ probabilityMap =
+ [ let docs = filter (hasWordAndTag word tag) allDocs
+ p = fi (length docs) / fi (length allDocs)
+ in (word, tag, p)
+ | word <- S.toList allWords,
+ tag <- S.toList allTags
+ ]
+ let tagValues =
+ probabilityMap
+ & filter (\(word, tag, _) -> hasWordAndTag word tag doc)
+ & foldl'
+ ( \scorePerTagValue (_, tag, p) ->
+ M.insertWith
+ (+)
+ (tagValue tag)
+ p
+ scorePerTagValue
+ )
+ M.empty
+ & M.toList
+ & sortBy (comparing (negate . snd))
+ & map fst
+ pure (Tag tagName (Just ""), map (Tag tagName) tagValues)
autoApplySuggestedTags :: [(Tag, [Tag])] -> [Tag]
autoApplySuggestedTags =
@@ -456,8 +497,8 @@ processDocuments settings allDocs docs =
doc
{ index =
doc.index
- { tags =
- S.delete (Tag "todo" Nothing) (doc.index.tags `S.union` tags)
+ { tags = doc.index.tags `S.union` tags,
+ todo = False
}
}
withGit do
@@ -475,8 +516,8 @@ tagDocumentInteractively settings allDocs doc = do
doc
{ index =
doc.index
- { tags =
- S.delete (Tag "todo" Nothing) (doc.index.tags `S.union` tags)
+ { tags = doc.index.tags `S.union` tags,
+ todo = False
}
}
withGit do
@@ -536,7 +577,7 @@ consume1 force keep filePath = do
else pure originalText'
addedAt <- getCurrentTime
withGit do
- J.encodeFile iFilePath Index {tags = S.singleton (Tag "todo" Nothing), ..}
+ J.encodeFile iFilePath Index {tags = S.empty, todo = True, ..}
if keep
then copyFile filePath oFilePath
else renameFile filePath oFilePath
@@ -600,7 +641,8 @@ ocr1 tmp input =
data Index = Index
{ originalText :: T.Text,
tags :: S.Set Tag,
- addedAt :: UTCTime
+ addedAt :: UTCTime,
+ todo :: Bool
}
deriving (Show, Generic, Eq)
@@ -618,11 +660,21 @@ instance HasField "internalTags" Index (S.Set Tag) where
index.tags
`S.union` internalTags index
+-- TODO Cache `originalWords`
+--
+-- @related cache-probabilitymap
+instance HasField "originalWords" Index (S.Set T.Text) where
+ getField index =
+ S.fromList (T.words index.originalText)
+
internalTags :: Index -> S.Set Tag
internalTags index =
S.fromList
- [ Tag "addedAt" (Just (T.pack (iso8601Show index.addedAt)))
- ]
+ ( concat
+ [ [Tag "addedAt" (Just (T.pack (iso8601Show index.addedAt)))],
+ if index.todo then [Tag "todo" Nothing] else []
+ ]
+ )
data PdfInfo = PdfInfo
{ numPages :: Int,
diff --git a/app/Settings.hs b/app/Settings.hs
index 5d4d55f..39e8e35 100644
--- a/app/Settings.hs
+++ b/app/Settings.hs
@@ -43,7 +43,8 @@ instance Monoid Settings where
--
-- - date formats that spell out the month name, ie. `1 Januar 1970` or `1 Jan 1970`
-- - perform minor corrections on OCR, ie. parse `0 1.01.1970`
- SuggestTagByRE "createdAt" [R.re|[0-9]{2}\.[0-9]{2}\.[0-9]{4}|]
+ SuggestTagByRE "createdAt" [R.re|[0-9]{2}\.[0-9]{2}\.[0-9]{4}|],
+ SuggestTagByTags "correspondent"
]
}
@@ -53,6 +54,7 @@ instance A.ToJSON Settings
data SuggestedTag
= SuggestTagByRE T.Text R.RE
+ | SuggestTagByTags T.Text
deriving (Show, Generic, Eq)
instance Show R.RE where