diff options
-rw-r--r-- | apaperless.yaml | 6 | ||||
-rw-r--r-- | app/Main.hs | 70 | ||||
-rw-r--r-- | app/Settings.hs | 4 |
3 files changed, 69 insertions, 11 deletions
diff --git a/apaperless.yaml b/apaperless.yaml index de0da43..f6e63a7 100644 --- a/apaperless.yaml +++ b/apaperless.yaml @@ -1,3 +1,7 @@ suggestedTags: -- - createdAt +- contents: + - createdAt - '[0-9]{2}\.[0-9]{2}\.[0-9]{4}' + tag: SuggestTagByRE +- contents: correspondent + tag: SuggestTagByTags diff --git a/app/Main.hs b/app/Main.hs index adb16b4..096f253 100644 --- a/app/Main.hs +++ b/app/Main.hs @@ -1,5 +1,6 @@ {-# LANGUAGE BlockArguments #-} {-# LANGUAGE DataKinds #-} +{-# LANGUAGE DuplicateRecordFields #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedRecordDot #-} {-# LANGUAGE OverloadedStrings #-} @@ -25,6 +26,7 @@ import Data.List import Data.List.NonEmpty qualified as N import Data.Map qualified as M import Data.Maybe (catMaybes, fromMaybe, mapMaybe) +import Data.Ord (comparing) import Data.Set qualified as S import Data.String (IsString (fromString)) import Data.Text qualified as T @@ -291,7 +293,7 @@ printTags doc = Nothing -> printf "@%s\n" (tagKey tag) Just tagValue -> printf "@%s %s\n" (tagKey tag) tagValue ) - doc.index.tags + (doc.index.tags `S.union` doc.index.internalTags) doRedoIf :: [Filter] -> Bool -> IO () doRedoIf filters redo = @@ -434,6 +436,45 @@ suggestTags settings allDocs doc = do doc.index.originalText R.*=~ pattern pure (Tag tagName (Just ""), map (Tag tagName . Just) tagValues) + S.SuggestTagByTags tagName -> do + let allTags = foldl S.union S.empty (map (.index.tags) allDocs) + allWords = foldl S.union S.empty (map (.index.originalWords) allDocs) + + hasWord word doc = S.member word doc.index.originalWords + hasTag tag doc = S.member tag doc.index.tags + hasWordAndTag word tag doc = hasTag tag doc && hasWord word doc + fi = fromIntegral + + -- TODO Consider words that contribute to NOT adding a tag + -- + -- If there is a document that should, say, not have a `@correspondant`, we should score words that contribute to that fact as well. + -- + -- @topic probability-map + + -- TODO Cache `probabilityMap` + probabilityMap = + [ let docs = filter (hasWordAndTag word tag) allDocs + p = fi (length docs) / fi (length allDocs) + in (word, tag, p) + | word <- S.toList allWords, + tag <- S.toList allTags + ] + let tagValues = + probabilityMap + & filter (\(word, tag, _) -> hasWordAndTag word tag doc) + & foldl' + ( \scorePerTagValue (_, tag, p) -> + M.insertWith + (+) + (tagValue tag) + p + scorePerTagValue + ) + M.empty + & M.toList + & sortBy (comparing (negate . snd)) + & map fst + pure (Tag tagName (Just ""), map (Tag tagName) tagValues) autoApplySuggestedTags :: [(Tag, [Tag])] -> [Tag] autoApplySuggestedTags = @@ -456,8 +497,8 @@ processDocuments settings allDocs docs = doc { index = doc.index - { tags = - S.delete (Tag "todo" Nothing) (doc.index.tags `S.union` tags) + { tags = doc.index.tags `S.union` tags, + todo = False } } withGit do @@ -475,8 +516,8 @@ tagDocumentInteractively settings allDocs doc = do doc { index = doc.index - { tags = - S.delete (Tag "todo" Nothing) (doc.index.tags `S.union` tags) + { tags = doc.index.tags `S.union` tags, + todo = False } } withGit do @@ -536,7 +577,7 @@ consume1 force keep filePath = do else pure originalText' addedAt <- getCurrentTime withGit do - J.encodeFile iFilePath Index {tags = S.singleton (Tag "todo" Nothing), ..} + J.encodeFile iFilePath Index {tags = S.empty, todo = True, ..} if keep then copyFile filePath oFilePath else renameFile filePath oFilePath @@ -600,7 +641,8 @@ ocr1 tmp input = data Index = Index { originalText :: T.Text, tags :: S.Set Tag, - addedAt :: UTCTime + addedAt :: UTCTime, + todo :: Bool } deriving (Show, Generic, Eq) @@ -618,11 +660,21 @@ instance HasField "internalTags" Index (S.Set Tag) where index.tags `S.union` internalTags index +-- TODO Cache `originalWords` +-- +-- @related cache-probabilitymap +instance HasField "originalWords" Index (S.Set T.Text) where + getField index = + S.fromList (T.words index.originalText) + internalTags :: Index -> S.Set Tag internalTags index = S.fromList - [ Tag "addedAt" (Just (T.pack (iso8601Show index.addedAt))) - ] + ( concat + [ [Tag "addedAt" (Just (T.pack (iso8601Show index.addedAt)))], + if index.todo then [Tag "todo" Nothing] else [] + ] + ) data PdfInfo = PdfInfo { numPages :: Int, diff --git a/app/Settings.hs b/app/Settings.hs index 5d4d55f..39e8e35 100644 --- a/app/Settings.hs +++ b/app/Settings.hs @@ -43,7 +43,8 @@ instance Monoid Settings where -- -- - date formats that spell out the month name, ie. `1 Januar 1970` or `1 Jan 1970` -- - perform minor corrections on OCR, ie. parse `0 1.01.1970` - SuggestTagByRE "createdAt" [R.re|[0-9]{2}\.[0-9]{2}\.[0-9]{4}|] + SuggestTagByRE "createdAt" [R.re|[0-9]{2}\.[0-9]{2}\.[0-9]{4}|], + SuggestTagByTags "correspondent" ] } @@ -53,6 +54,7 @@ instance A.ToJSON Settings data SuggestedTag = SuggestTagByRE T.Text R.RE + | SuggestTagByTags T.Text deriving (Show, Generic, Eq) instance Show R.RE where |