summaryrefslogtreecommitdiffstats
path: root/app/Main.hs
diff options
context:
space:
mode:
authorLibravatar Alexander Foremny <aforemny@posteo.de>2024-01-11 03:20:56 +0100
committerLibravatar Alexander Foremny <aforemny@posteo.de>2024-01-11 03:26:52 +0100
commit7bdf16be84b368655ce2ee3d9ab6bf185dfb59b5 (patch)
tree3ee48fc98f98ab7ac7ad19e24334e07b8b147dd6 /app/Main.hs
parent673c59d9be8b62106ffbba96d805680f0b5e7e3f (diff)
chore: make computing `probabilityMap` more performance
Diffstat (limited to 'app/Main.hs')
-rw-r--r--app/Main.hs371
1 files changed, 131 insertions, 240 deletions
diff --git a/app/Main.hs b/app/Main.hs
index 93fed75..936f053 100644
--- a/app/Main.hs
+++ b/app/Main.hs
@@ -1,5 +1,6 @@
{-# LANGUAGE BlockArguments #-}
{-# LANGUAGE DataKinds #-}
+{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE DuplicateRecordFields #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE MultiWayIf #-}
@@ -16,41 +17,42 @@ module Main where
import Control.Arrow (first, second)
import Control.Concurrent.ParallelIO.Local (parallel, withPool)
import Control.Exception (Exception, throw, throwIO)
-import Control.Monad (forM, unless, when)
+import Control.Monad (unless, when)
import Data.Aeson qualified as J
import Data.Attoparsec.Text qualified as A
import Data.ByteString.Lazy qualified as LB
-import Data.Default
import Data.Digest.Pure.SHA (sha1, showDigest)
import Data.Either (partitionEithers)
import Data.Function ((&))
+import Data.HashMap.Internal qualified as HM
import Data.List
import Data.List.NonEmpty qualified as N
import Data.Map qualified as M
-import Data.Maybe (catMaybes, fromMaybe, mapMaybe, maybeToList)
+import Data.Maybe (fromMaybe, mapMaybe)
import Data.Ord (comparing)
import Data.Set qualified as S
import Data.String (IsString (fromString))
import Data.Text qualified as T
import Data.Text.Encoding qualified as T
import Data.Text.IO qualified as T
-import Data.Time.Clock (UTCTime, getCurrentTime)
-import Data.Time.Format.ISO8601 (iso8601ParseM, iso8601Show)
+import Data.Time.Clock (getCurrentTime)
+import Data.Time.Format.ISO8601 (iso8601ParseM)
import Debug.Trace
+import Document qualified as D
import GHC.Conc (getNumProcessors)
-import GHC.Generics (Generic)
-import GHC.Records (HasField (..))
import Options.Applicative qualified as O
+import ProbabilityMap qualified as C
import Prompt qualified as P
import Settings qualified as S
+import Store qualified as R
import System.Directory
import System.Environment (getEnv)
import System.FilePath
-import System.IO.LockFile (withLockFile)
import System.IO.Temp (withSystemTempDirectory)
import System.Process.Typed
import Tag qualified as G
import Text.Printf (printf)
+import Text.RE.Replace qualified as R
import Text.RE.TDFA.Text qualified as R
import Text.Read (readMaybe)
@@ -291,13 +293,13 @@ main = do
keep
)
(map (cwd </>) inputs)
- docs <- mapM (readDocument . (<.> "json")) indexNames
- allDocs <- getDocuments
+ docs <- mapM (D.readDocument . (<.> "json")) indexNames
+ probabilityCache <- C.readProbabilityCache
docs' <-
if
- | auto -> processDocuments settings allDocs docs
+ | auto -> processDocuments settings probabilityCache docs
-- TODO adding tags interactively through prompt does not persist them in store
- | prompt -> processDocumentsInteractively settings allDocs docs
+ | prompt -> processDocumentsInteractively settings probabilityCache docs
| otherwise -> pure docs
mapM_
( \doc -> do
@@ -307,7 +309,7 @@ main = do
docs'
Args {cmd = Edit {indexNames}} -> do
editDocuments
- =<< mapM (readDocument . (<.> "json")) indexNames
+ =<< mapM (D.readDocument . (<.> "json")) indexNames
Args {cmd = List {filters, redo, todo = False, view = False, edit = False}} -> do
doRedoIf filters redo
mapM_
@@ -316,24 +318,26 @@ main = do
printTags doc
)
. filter (G.applyFilters filters . (.index.tags))
- =<< getDocuments
+ =<< D.getDocuments
Args {cmd = List {filters, redo, edit = True}} -> do
doRedoIf filters redo
editDocuments
. filter (G.applyFilters filters . (.index.tags))
- =<< getDocuments
+ =<< D.getDocuments
Args {cmd = List {filters, redo, todo = True}} -> do
doRedoIf filters redo
- allDocs <- getDocuments
+ allDocs <- D.getDocuments
+ probabilityCache <- C.readProbabilityCache
_ <-
- processDocumentsInteractively settings allDocs
+ processDocumentsInteractively settings probabilityCache
. filter (G.applyFilters filters . (.index.tags))
$ allDocs
pure ()
Args {cmd = Todo} -> do
- allDocs <- getDocuments
+ allDocs <- D.getDocuments
+ probabilityCache <- C.readProbabilityCache
_ <-
- processDocumentsInteractively settings allDocs
+ processDocumentsInteractively settings probabilityCache
. filter (G.applyFilters [G.filter G.include "todo" Nothing] . (.index.tags))
$ allDocs
pure ()
@@ -341,12 +345,12 @@ main = do
doRedoIf filters redo
viewDocuments
. filter (G.applyFilters filters . (.index.tags))
- =<< getDocuments
+ =<< D.getDocuments
Args {cmd = View {indexNames}} -> do
viewDocuments
- =<< mapM (readDocument . (<.> "json")) indexNames
+ =<< mapM (D.readDocument . (<.> "json")) indexNames
Args {cmd = TopWords} -> do
- allDocs <- getDocuments
+ probabilityCache <- C.readProbabilityCache
mapM_
( \(tag, xs) -> do
print tag
@@ -355,24 +359,23 @@ main = do
$ map (second (sortBy (comparing (negate . snd))))
$ map (second (filter ((> 0) . snd)))
$ M.toList
- $ foldl
- ( \wordProbabilityPerTag (word, tag, p) ->
+ $ M.foldlWithKey
+ ( \wordProbabilityPerTag (word, tag) p ->
M.insertWith (++) tag [(word, p)] wordProbabilityPerTag
)
M.empty
- (probabilityMap allDocs)
+ probabilityCache.probabilityMap
Args {cmd = Modify {indexNames, tags, untags}} -> do
- docs <- mapM (readDocument . (<.> "json")) indexNames
+ docs <- mapM (D.readDocument . (<.> "json")) indexNames
mapM_
- ( \doc -> do
- let doc' = addTags tags (removeTags untags doc)
- withGit do
- J.encodeFile doc'.iFilePath doc'.index
- commitAll [doc.iFilePath] (printf "tag %s" (takeBaseName doc.iFilePath))
+ ( \doc ->
+ R.replaceDocument
+ (printf "tag %s" (takeBaseName doc.iFilePath))
+ doc
)
- docs
+ (map (addTags tags . removeTags untags) docs)
-printTags :: Document -> IO ()
+printTags :: D.Document -> IO ()
printTags doc =
mapM_
( \tag ->
@@ -380,76 +383,36 @@ printTags doc =
Nothing -> printf "@%s\n" (G.tagKey tag)
Just tagValue -> printf "@%s %s\n" (G.tagKey tag) tagValue
)
- ( filter
- ( \tag ->
- -- TODO Handle tags hidden by default
- --
- -- - Add `list-default-hide-tags` to `Settings`
- -- - Add `--show-tags`, `--hide-tags` to `List`
- not ("paperless-ngx." `T.isPrefixOf` G.tagKey tag)
- && (G.tagKey tag /= "content")
- && (G.tagKey tag /= "importedFrom")
- && (G.tagKey tag /= "language")
- )
+ ( filter arbitrarilySelectTag
. S.toList
$ doc.index.tags `S.union` doc.index.internalTags
)
+arbitrarilySelectTag :: G.Tag -> Bool
+arbitrarilySelectTag tag =
+ -- TODO Handle tags hidden by default
+ --
+ -- - Add `list-default-hide-tags` to `Settings`
+ -- - Add `--show-tags`, `--hide-tags` to `List`
+ not ("paperless-ngx." `T.isPrefixOf` G.tagKey tag)
+ && (G.tagKey tag /= "content")
+ && (G.tagKey tag /= "importedFrom")
+ && (G.tagKey tag /= "language")
+
doRedoIf :: [G.Filter] -> Bool -> IO ()
doRedoIf filters redo =
when redo do
parMapM_ doRedo
. filter (G.applyFilters filters . (.index.tags))
- =<< getDocuments
+ =<< D.getDocuments
where
doRedo doc = do
content <- ocr doc.index.language doc.oFilePath
- withGit do
- J.encodeFile doc.iFilePath doc.index {content = content}
- commitAll [doc.iFilePath] (printf "redo %s" (takeBaseName doc.iFilePath))
-
-data Document = Document
- { iFileName :: String,
- index :: Index
- }
- deriving (Show)
-
-instance HasField "oFilePath" Document FilePath where
- getField doc = "originals" </> takeBaseName doc.iFileName <.> "pdf"
-
-instance HasField "iFilePath" Document FilePath where
- getField doc = "index" </> doc.iFileName
-
-hasWord :: T.Text -> Document -> Bool
-hasWord word doc = S.member word doc.index.originalWords
-
-hasTag :: G.Tag -> Document -> Bool
-hasTag tag doc = S.member tag doc.index.tags
-
-hasWordAndTag :: T.Text -> G.Tag -> Document -> Bool
-hasWordAndTag word tag doc = hasTag tag doc && hasWord word doc
-
-tagValues :: [Document] -> M.Map T.Text (S.Set T.Text)
-tagValues docs =
- M.unionsWith S.union $
- mapMaybe
- ( \tag ->
- M.singleton (G.tagKey tag) . S.singleton <$> (G.tagValue tag)
- )
- (S.toList (S.unions (map (.index.tags) docs)))
+ let doc' = doc {D.index = doc.index {D.content = content}}
+ R.replaceDocument (printf "redo %s" (takeBaseName doc.iFilePath)) doc'
-getDocuments :: IO [Document]
-getDocuments =
- parMapM readDocument
- =<< sort <$> listDirectory "index"
-
-readDocument :: FilePath -> IO Document
-readDocument iFileName =
- Document iFileName
- <$> decodeFile @Index ("index" </> iFileName)
-
-processDocumentsInteractively :: S.Settings -> [Document] -> [Document] -> IO [Document]
-processDocumentsInteractively settings allDocs docs =
+processDocumentsInteractively :: S.Settings -> C.ProbabilityCache -> [D.Document] -> IO [D.Document]
+processDocumentsInteractively settings probabilityCache docs = do
mapM (uncurry processDocumentInteractively) (zip [1 :: Int ..] docs)
where
numDocs = length docs
@@ -478,13 +441,13 @@ processDocumentsInteractively settings allDocs docs =
printf "%s\n" (takeBaseName doc.iFileName)
printf "%s\n" doc.index.content
processDocumentInteractively n doc
- "p" -> tagDocumentInteractively settings allDocs doc
+ "p" -> tagDocumentInteractively settings probabilityCache doc
"s" -> pure doc
"v" -> do
viewDocuments [doc]
processDocumentInteractively n doc
-viewDocuments :: [Document] -> IO ()
+viewDocuments :: [D.Document] -> IO ()
viewDocuments docs =
sh_
( "zathura "
@@ -494,7 +457,7 @@ viewDocuments docs =
)
)
-editDocuments :: [Document] -> IO ()
+editDocuments :: [D.Document] -> IO ()
editDocuments docs =
withSystemTempDirectory "apaperless" $ \tmp -> do
let fp doc = tmp </> takeBaseName doc.iFileName <.> "txt"
@@ -513,63 +476,49 @@ editDocuments docs =
mapM_
( \doc -> do
content <- T.readFile (fp doc)
- withGit do
- J.encodeFile doc.iFilePath doc.index {content = content}
- commitAll [doc.iFilePath] (printf "edit %s" (takeBaseName doc.iFilePath))
+ let doc' = doc {D.index = doc.index {D.content = content}}
+ R.replaceDocument (printf "edit %s" (takeBaseName doc.iFilePath)) doc'
pure content
)
docs
-type ProbabilityMap = [(T.Text, G.Tag, Double)]
-
--- TODO Consider words that contribute to NOT adding a tag
---
--- If there is a document that should, say, not have a `@correspondent`, we should score words that contribute to that fact as well.
---
--- @topic probability-map
-probabilityMap :: [Document] -> ProbabilityMap
-probabilityMap allDocs =
- [ let docsWithWord = filter (hasWord word) allDocs
- docsWithTag = filter (hasTag tag) allDocs
- docsWithWordAndTag = filter (hasWordAndTag word tag) allDocs
- p =
- fi (length docsWithWordAndTag)
- / fi
- ( length docsWithWord
- + length docsWithTag
- - length docsWithWordAndTag
- )
- in (word, tag, p)
- | word <- S.toList allWords,
- tag <- S.toList allTags
- ]
- where
- allTags = foldl S.union S.empty (map (.index.tags) allDocs)
- allWords = foldl S.union S.empty (map (.index.originalWords) allDocs)
- fi = fromIntegral @Int @Double
-
-suggestTags :: S.Settings -> [Document] -> Document -> IO [(G.Tag, [G.Tag])]
-suggestTags settings allDocs doc = do
- forM settings.suggestedTags $ \suggestedTag -> do
+suggestTags :: S.Settings -> C.ProbabilityCache -> D.Document -> [(G.Tag, [G.Tag])]
+suggestTags settings probabilityCache doc = do
+ flip map settings.suggestedTags $ \suggestedTag ->
case suggestedTag of
- S.SuggestTagByRE tagName pattern -> do
+ S.SuggestTagByRE tagName pattern ->
let tagValues =
- nub . catMaybes . map R.matchedText . R.allMatches $
- doc.index.content
- R.*=~ pattern
- pure (G.tag tagName (Just ""), map (G.tag tagName . Just) tagValues)
- S.SuggestTagByTags tagName -> do
+ nub
+ . mapMaybe
+ ( -- XXX Whys is this so complicated?
+ \match -> do
+ captures <- snd <$> R.matchCaptures match
+ pure
+ ( foldl
+ ( \template (captureName, captureOrdinal) ->
+ T.replace
+ ("${" <> R.getCaptureName captureName <> "}")
+ (R.capturedText (captures !! (fromEnum captureOrdinal - 1)))
+ template
+ )
+ (R.getTemplate pattern)
+ (HM.toList $ R.captureNames match)
+ )
+ )
+ $ R.allMatches (doc.index.content R.*=~ (R.getSearch pattern))
+ in (G.tag tagName (Just ""), map (G.tag tagName . Just) tagValues)
+ S.SuggestTagByTags tagName ->
let tagValues =
- -- TODO Cache `probabilityMap`
+ -- TODO Cache `probabilityCache`
--
-- @topic probability-map
- probabilityMap allDocs
- & filter
- ( \(word, tag, _) ->
- G.tagKey tag == tagName && hasWord word doc
+ probabilityCache.probabilityMap
+ & M.filterWithKey
+ ( \(word, tag) _ ->
+ G.tagKey tag == tagName && D.hasWord word doc
)
- & foldl'
- ( \scorePerTagValue (_, tag, p) ->
+ & M.foldlWithKey'
+ ( \scorePerTagValue (_, tag) p ->
M.insertWith
(+)
(G.tagValue tag)
@@ -580,7 +529,7 @@ suggestTags settings allDocs doc = do
& M.toList
& sortBy (comparing (negate . snd))
& map fst
- pure (G.tag tagName (Just ""), map (G.tag tagName) tagValues)
+ in (G.tag tagName (Just ""), map (G.tag tagName) tagValues)
autoApplySuggestedTags :: [(G.Tag, [G.Tag])] -> [G.Tag]
autoApplySuggestedTags =
@@ -591,28 +540,26 @@ autoApplySuggestedTags =
else Just (head suggestedTags)
)
-processDocuments :: S.Settings -> [Document] -> [Document] -> IO [Document]
-processDocuments settings allDocs docs =
+processDocuments :: S.Settings -> C.ProbabilityCache -> [D.Document] -> IO [D.Document]
+processDocuments settings probabilityCache docs =
mapM processDocument docs
where
processDocument doc = do
- tags <-
- S.fromList . autoApplySuggestedTags
- <$> suggestTags settings allDocs doc
+ let tags =
+ S.fromList . autoApplySuggestedTags $
+ suggestTags settings probabilityCache doc
let doc' =
doc
- { index =
+ { D.index =
doc.index
- { tags = doc.index.tags `S.union` tags,
- todo = False
+ { D.tags = doc.index.tags `S.union` tags,
+ D.todo = False
}
}
- withGit do
- J.encodeFile doc.iFilePath doc'.index
- commitAll [doc.iFilePath] (printf "process %s (auto)" doc.iFilePath)
+ R.replaceDocument (printf "process %s (auto)" doc.iFilePath) doc'
pure doc'
-applyTags :: [Either T.Text G.Tag] -> Document -> Document
+applyTags :: [Either T.Text G.Tag] -> D.Document -> D.Document
applyTags tags' doc = do
addTags tags (removeTags untags doc)
where
@@ -625,43 +572,43 @@ data AddTagException = AddTagException G.Tag
instance Exception AddTagException
-addTags :: [G.Tag] -> Document -> Document
+addTags :: [G.Tag] -> D.Document -> D.Document
addTags =
flip (foldl (flip addTag))
-addTag :: G.Tag -> Document -> Document
+addTag :: G.Tag -> D.Document -> D.Document
addTag tag doc =
if
| G.tagKey tag == "todo" ->
- doc {index = doc.index {todo = True}}
+ doc {D.index = doc.index {D.todo = True}}
| G.tagKey tag == "addedAt" ->
maybe
(throw (AddTagException tag))
( \addedAt ->
- doc {index = doc.index {addedAt = addedAt}}
+ doc {D.index = doc.index {D.addedAt = addedAt}}
)
(iso8601ParseM . T.unpack =<< (G.tagValue tag))
| G.tagKey tag == "modifiedAt" ->
maybe
(throw (AddTagException tag))
( \modifiedAt ->
- doc {index = doc.index {modifiedAt = Just modifiedAt}}
+ doc {D.index = doc.index {D.modifiedAt = Just modifiedAt}}
)
(iso8601ParseM . T.unpack =<< (G.tagValue tag))
| G.tagKey tag == "content" ->
maybe
(throw (AddTagException tag))
( \content ->
- doc {index = doc.index {content = content}}
+ doc {D.index = doc.index {D.content = content}}
)
(G.tagValue tag)
| G.tagKey tag == "language" ->
throw (AddTagException tag)
| otherwise ->
doc
- { index =
+ { D.index =
doc.index
- { tags = G.replace tag doc.index.tags
+ { D.tags = G.replace tag doc.index.tags
}
}
@@ -670,15 +617,15 @@ data RemoveTagException = RemoveTagException G.Tag
instance Exception RemoveTagException
-removeTags :: [G.Tag] -> Document -> Document
+removeTags :: [G.Tag] -> D.Document -> D.Document
removeTags =
flip (foldl (flip removeTag))
-removeTag :: G.Tag -> Document -> Document
+removeTag :: G.Tag -> D.Document -> D.Document
removeTag tag doc =
if
| G.tagKey tag == "todo" ->
- doc {index = doc.index {todo = False}}
+ doc {D.index = doc.index {D.todo = False}}
| G.tagKey tag == "addedAt" ->
throw (RemoveTagException tag)
| G.tagKey tag == "modifiedAt" ->
@@ -689,9 +636,9 @@ removeTag tag doc =
throw (RemoveTagException tag)
| otherwise ->
doc
- { index =
+ { D.index =
doc.index
- { tags =
+ { D.tags =
maybe
(G.deleteAll (G.tagKey tag))
(\_ -> G.delete tag)
@@ -700,18 +647,16 @@ removeTag tag doc =
}
}
-tagDocumentInteractively :: S.Settings -> [Document] -> Document -> IO Document
-tagDocumentInteractively settings allDocs doc = do
- suggestedTags <- suggestTags settings allDocs doc
+tagDocumentInteractively :: S.Settings -> C.ProbabilityCache -> D.Document -> IO D.Document
+tagDocumentInteractively settings probabilityCache doc = do
+ let suggestedTags = suggestTags settings probabilityCache doc
tags <- mapM (uncurry tagDocumentInteractively') suggestedTags
let doc' =
(applyTags tags doc)
- { index = doc.index {todo = False}
+ { D.index = doc.index {D.todo = False}
}
- withGit do
- J.encodeFile doc.iFilePath doc'.index
- commitAll [doc.iFilePath] (printf "process %s (interactive)" doc.iFilePath)
- pure doc'
+ R.replaceDocument (printf "process %s (interactive)" doc.iFilePath) doc'
+ pure doc'
where
tagDocumentInteractively' :: G.Tag -> [G.Tag] -> IO (Either T.Text G.Tag)
tagDocumentInteractively' tag tags
@@ -770,30 +715,19 @@ consume1 language force keep filePath = do
then ocr language filePath
else pure content'
addedAt <- getCurrentTime
- withGit do
- J.encodeFile
- iFilePath
- Index
- { tags = S.empty,
- todo = True,
- modifiedAt = Nothing,
- ..
- }
- if keep
- then copyFile filePath oFilePath
- else renameFile filePath oFilePath
- commitAll
- [iFilePath, oFilePath]
- (printf "add %s" (takeFileName filePath))
- pure (takeBaseName iFilePath)
-
-withGit :: IO a -> IO a
-withGit = withLockFile def ".gitlock"
+ let doc =
+ D.Document (fKey <.> "json") $
+ D.Index
+ { tags = S.empty,
+ todo = True,
+ modifiedAt = Nothing,
+ ..
+ }
+ R.commitDocument (printf "add %s" (takeFileName filePath)) filePath doc
+ when (not keep) do
+ removeFile filePath
-commitAll :: [FilePath] -> String -> IO ()
-commitAll fps m = do
- sh_ (">/dev/null git add -- " ++ intercalate " " (map (printf "'%s'") fps))
- sh_ (printf ">/dev/null git commit -m '%s' || :" m)
+ pure (takeBaseName iFilePath)
data DecodeException = DecodeException FilePath String
deriving (Show)
@@ -839,49 +773,6 @@ ocr1 language tmp input =
-- XXX `--oem 1` seems to be unavailable
<$> sh (printf "tesseract '%s' - -l '%s' --oem 3 --psm 1" (tmp </> input) language)
-data Index = Index
- { content :: T.Text,
- tags :: S.Set G.Tag,
- addedAt :: UTCTime,
- modifiedAt :: Maybe UTCTime,
- todo :: Bool,
- language :: String
- }
- deriving (Show, Generic, Eq)
-
-instance J.ToJSON Index
-
-instance J.FromJSON Index
-
-instance HasField "shortText" Index T.Text where
- getField =
- (T.unlines . take 10 . T.lines)
- . (.content)
-
-instance HasField "internalTags" Index (S.Set G.Tag) where
- getField index =
- index.tags
- `S.union` internalTags index
-
--- TODO Cache `originalWords`
---
--- @related cache-probabilitymap
-instance HasField "originalWords" Index (S.Set T.Text) where
- getField index =
- S.fromList (T.words index.content)
-
-internalTags :: Index -> S.Set G.Tag
-internalTags index =
- S.fromList
- ( concat
- [ [ G.tag "addedAt" (Just (T.pack (iso8601Show index.addedAt))),
- G.tag "language" (Just (T.pack index.language))
- ],
- maybeToList (G.tag "modifiedAt" . Just . T.pack . iso8601Show <$> index.modifiedAt),
- if index.todo then [G.tag "todo" Nothing] else []
- ]
- )
-
data PdfInfo = PdfInfo
{ numPages :: Int,
pageSize :: (Double, Double)