summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLibravatar Alexander Foremny <aforemny@posteo.de>2024-01-11 03:20:56 +0100
committerLibravatar Alexander Foremny <aforemny@posteo.de>2024-01-11 03:26:52 +0100
commit7bdf16be84b368655ce2ee3d9ab6bf185dfb59b5 (patch)
tree3ee48fc98f98ab7ac7ad19e24334e07b8b147dd6
parent673c59d9be8b62106ffbba96d805680f0b5e7e3f (diff)
chore: make computing `probabilityMap` more performance
-rw-r--r--apaperless.cabal14
-rw-r--r--apaperless.yaml3
-rw-r--r--app/Document.hs134
-rw-r--r--app/Main.hs371
-rw-r--r--app/ProbabilityMap.hs230
-rw-r--r--app/Settings.hs25
-rw-r--r--app/Store.hs60
-rw-r--r--tags/src/Tag.hs4
-rw-r--r--tags/tags.cabal3
9 files changed, 598 insertions, 246 deletions
diff --git a/apaperless.cabal b/apaperless.cabal
index 480b605..1217b15 100644
--- a/apaperless.cabal
+++ b/apaperless.cabal
@@ -14,14 +14,17 @@ extra-doc-files: CHANGELOG.md
-- extra-source-files:
common warnings
- ghc-options: -Wall -threaded
+ ghc-options: -Wall -threaded -with-rtsopts=-N -rtsopts
executable apaperless
import: warnings
main-is: Main.hs
other-modules:
+ Document
+ ProbabilityMap
Prompt
Settings
+ Store
-- other-extensions:
build-depends:
base,
@@ -43,6 +46,13 @@ executable apaperless
time,
regex,
yaml,
- xdg-basedir
+ xdg-basedir,
+ binary,
+ vector,
+ parallel,
+ deepseq,
+ unordered-containers,
+ vector-binary-instances,
+ split
hs-source-dirs: app
default-language: GHC2021
diff --git a/apaperless.yaml b/apaperless.yaml
index ed74742..bbc9bec 100644
--- a/apaperless.yaml
+++ b/apaperless.yaml
@@ -2,7 +2,8 @@ defaultLanguage: deu+eng
suggestedTags:
- contents:
- createdAt
- - '[0-9]{2}\.[0-9]{2}\.[0-9]{4}'
+ - - ${d}([0-9]{2})\.${m}([0-9]{2})\.${y}([0-9]{4})
+ - ${y}-${m}-${d}
tag: SuggestTagByRE
- contents: correspondent
tag: SuggestTagByTags
diff --git a/app/Document.hs b/app/Document.hs
new file mode 100644
index 0000000..ac8a73b
--- /dev/null
+++ b/app/Document.hs
@@ -0,0 +1,134 @@
+{-# LANGUAGE DataKinds #-}
+{-# LANGUAGE OverloadedRecordDot #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE StrictData #-}
+
+module Document
+ ( Document (..),
+ Index (..),
+ hasWord,
+ hasTag,
+ hasWordAndTag,
+ tagValues,
+ getDocuments,
+ readDocument,
+ )
+where
+
+import Control.Concurrent.ParallelIO.Local (parallel, withPool)
+import Control.Exception (Exception, throwIO)
+import Data.Aeson qualified as J
+import Data.ByteString.Lazy qualified as LB
+import Data.List (sort)
+import Data.Map qualified as M
+import Data.Maybe (mapMaybe, maybeToList)
+import Data.Set qualified as S
+import Data.Text qualified as T
+import Data.Time.Clock (UTCTime)
+import Data.Time.Format.ISO8601 (iso8601Show)
+import GHC.Conc (getNumProcessors)
+import GHC.Generics (Generic)
+import GHC.Records (HasField (..))
+import System.Directory (listDirectory)
+import System.FilePath (takeBaseName, (<.>), (</>))
+import Tag qualified as G
+
+-- TODO Inline `Index`
+
+-- TODO Replace `iFileName` with `id`
+data Document = Document
+ { iFileName :: String,
+ index :: Index
+ }
+ deriving (Show)
+
+instance HasField "oFilePath" Document FilePath where
+ getField doc = "originals" </> takeBaseName doc.iFileName <.> "pdf"
+
+instance HasField "iFilePath" Document FilePath where
+ getField doc = "index" </> doc.iFileName
+
+hasWord :: T.Text -> Document -> Bool
+hasWord word doc = S.member word doc.index.originalWords
+
+hasTag :: G.Tag -> Document -> Bool
+hasTag tag doc = S.member tag doc.index.tags
+
+hasWordAndTag :: T.Text -> G.Tag -> Document -> Bool
+hasWordAndTag word tag doc = hasTag tag doc && hasWord word doc
+
+tagValues :: [Document] -> M.Map T.Text (S.Set T.Text)
+tagValues docs =
+ M.unionsWith S.union $
+ mapMaybe
+ ( \tag ->
+ M.singleton (G.tagKey tag) . S.singleton <$> (G.tagValue tag)
+ )
+ (S.toList (S.unions (map (.index.tags) docs)))
+
+getDocuments :: IO [Document]
+getDocuments =
+ parMapM readDocument
+ =<< sort <$> listDirectory "index"
+
+readDocument :: FilePath -> IO Document
+readDocument iFileName =
+ Document iFileName
+ <$> decodeFile @Index ("index" </> iFileName)
+
+data Index = Index
+ { content :: T.Text,
+ tags :: S.Set G.Tag,
+ addedAt :: UTCTime,
+ modifiedAt :: Maybe UTCTime,
+ todo :: Bool,
+ language :: String
+ }
+ deriving (Show, Generic, Eq)
+
+instance J.ToJSON Index
+
+instance J.FromJSON Index
+
+instance HasField "shortText" Index T.Text where
+ getField =
+ (T.unlines . take 10 . T.lines)
+ . (.content)
+
+instance HasField "internalTags" Index (S.Set G.Tag) where
+ getField index =
+ index.tags `S.union` internalTags index
+
+-- TODO Cache `originalWords`
+--
+-- @related cache-probabilitymap
+instance HasField "originalWords" Index (S.Set T.Text) where
+ getField index =
+ S.fromList (T.words index.content)
+
+internalTags :: Index -> S.Set G.Tag
+internalTags index =
+ S.fromList
+ ( concat
+ [ [ G.tag "addedAt" (Just (T.pack (iso8601Show index.addedAt))),
+ G.tag "language" (Just (T.pack index.language))
+ ],
+ maybeToList (G.tag "modifiedAt" . Just . T.pack . iso8601Show <$> index.modifiedAt),
+ if index.todo then [G.tag "todo" Nothing] else []
+ ]
+ )
+
+parMapM :: (a -> IO b) -> [a] -> IO [b]
+parMapM f xs = do
+ n <- getNumProcessors
+ withPool n $ \pool -> parallel pool (map f xs)
+
+data DecodeException = DecodeException FilePath String
+ deriving (Show)
+
+instance Exception DecodeException
+
+decodeFile :: J.FromJSON a => FilePath -> IO a
+decodeFile fp =
+ either (throwIO . DecodeException fp) pure . J.eitherDecode
+ =<< LB.readFile fp
diff --git a/app/Main.hs b/app/Main.hs
index 93fed75..936f053 100644
--- a/app/Main.hs
+++ b/app/Main.hs
@@ -1,5 +1,6 @@
{-# LANGUAGE BlockArguments #-}
{-# LANGUAGE DataKinds #-}
+{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE DuplicateRecordFields #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE MultiWayIf #-}
@@ -16,41 +17,42 @@ module Main where
import Control.Arrow (first, second)
import Control.Concurrent.ParallelIO.Local (parallel, withPool)
import Control.Exception (Exception, throw, throwIO)
-import Control.Monad (forM, unless, when)
+import Control.Monad (unless, when)
import Data.Aeson qualified as J
import Data.Attoparsec.Text qualified as A
import Data.ByteString.Lazy qualified as LB
-import Data.Default
import Data.Digest.Pure.SHA (sha1, showDigest)
import Data.Either (partitionEithers)
import Data.Function ((&))
+import Data.HashMap.Internal qualified as HM
import Data.List
import Data.List.NonEmpty qualified as N
import Data.Map qualified as M
-import Data.Maybe (catMaybes, fromMaybe, mapMaybe, maybeToList)
+import Data.Maybe (fromMaybe, mapMaybe)
import Data.Ord (comparing)
import Data.Set qualified as S
import Data.String (IsString (fromString))
import Data.Text qualified as T
import Data.Text.Encoding qualified as T
import Data.Text.IO qualified as T
-import Data.Time.Clock (UTCTime, getCurrentTime)
-import Data.Time.Format.ISO8601 (iso8601ParseM, iso8601Show)
+import Data.Time.Clock (getCurrentTime)
+import Data.Time.Format.ISO8601 (iso8601ParseM)
import Debug.Trace
+import Document qualified as D
import GHC.Conc (getNumProcessors)
-import GHC.Generics (Generic)
-import GHC.Records (HasField (..))
import Options.Applicative qualified as O
+import ProbabilityMap qualified as C
import Prompt qualified as P
import Settings qualified as S
+import Store qualified as R
import System.Directory
import System.Environment (getEnv)
import System.FilePath
-import System.IO.LockFile (withLockFile)
import System.IO.Temp (withSystemTempDirectory)
import System.Process.Typed
import Tag qualified as G
import Text.Printf (printf)
+import Text.RE.Replace qualified as R
import Text.RE.TDFA.Text qualified as R
import Text.Read (readMaybe)
@@ -291,13 +293,13 @@ main = do
keep
)
(map (cwd </>) inputs)
- docs <- mapM (readDocument . (<.> "json")) indexNames
- allDocs <- getDocuments
+ docs <- mapM (D.readDocument . (<.> "json")) indexNames
+ probabilityCache <- C.readProbabilityCache
docs' <-
if
- | auto -> processDocuments settings allDocs docs
+ | auto -> processDocuments settings probabilityCache docs
-- TODO adding tags interactively through prompt does not persist them in store
- | prompt -> processDocumentsInteractively settings allDocs docs
+ | prompt -> processDocumentsInteractively settings probabilityCache docs
| otherwise -> pure docs
mapM_
( \doc -> do
@@ -307,7 +309,7 @@ main = do
docs'
Args {cmd = Edit {indexNames}} -> do
editDocuments
- =<< mapM (readDocument . (<.> "json")) indexNames
+ =<< mapM (D.readDocument . (<.> "json")) indexNames
Args {cmd = List {filters, redo, todo = False, view = False, edit = False}} -> do
doRedoIf filters redo
mapM_
@@ -316,24 +318,26 @@ main = do
printTags doc
)
. filter (G.applyFilters filters . (.index.tags))
- =<< getDocuments
+ =<< D.getDocuments
Args {cmd = List {filters, redo, edit = True}} -> do
doRedoIf filters redo
editDocuments
. filter (G.applyFilters filters . (.index.tags))
- =<< getDocuments
+ =<< D.getDocuments
Args {cmd = List {filters, redo, todo = True}} -> do
doRedoIf filters redo
- allDocs <- getDocuments
+ allDocs <- D.getDocuments
+ probabilityCache <- C.readProbabilityCache
_ <-
- processDocumentsInteractively settings allDocs
+ processDocumentsInteractively settings probabilityCache
. filter (G.applyFilters filters . (.index.tags))
$ allDocs
pure ()
Args {cmd = Todo} -> do
- allDocs <- getDocuments
+ allDocs <- D.getDocuments
+ probabilityCache <- C.readProbabilityCache
_ <-
- processDocumentsInteractively settings allDocs
+ processDocumentsInteractively settings probabilityCache
. filter (G.applyFilters [G.filter G.include "todo" Nothing] . (.index.tags))
$ allDocs
pure ()
@@ -341,12 +345,12 @@ main = do
doRedoIf filters redo
viewDocuments
. filter (G.applyFilters filters . (.index.tags))
- =<< getDocuments
+ =<< D.getDocuments
Args {cmd = View {indexNames}} -> do
viewDocuments
- =<< mapM (readDocument . (<.> "json")) indexNames
+ =<< mapM (D.readDocument . (<.> "json")) indexNames
Args {cmd = TopWords} -> do
- allDocs <- getDocuments
+ probabilityCache <- C.readProbabilityCache
mapM_
( \(tag, xs) -> do
print tag
@@ -355,24 +359,23 @@ main = do
$ map (second (sortBy (comparing (negate . snd))))
$ map (second (filter ((> 0) . snd)))
$ M.toList
- $ foldl
- ( \wordProbabilityPerTag (word, tag, p) ->
+ $ M.foldlWithKey
+ ( \wordProbabilityPerTag (word, tag) p ->
M.insertWith (++) tag [(word, p)] wordProbabilityPerTag
)
M.empty
- (probabilityMap allDocs)
+ probabilityCache.probabilityMap
Args {cmd = Modify {indexNames, tags, untags}} -> do
- docs <- mapM (readDocument . (<.> "json")) indexNames
+ docs <- mapM (D.readDocument . (<.> "json")) indexNames
mapM_
- ( \doc -> do
- let doc' = addTags tags (removeTags untags doc)
- withGit do
- J.encodeFile doc'.iFilePath doc'.index
- commitAll [doc.iFilePath] (printf "tag %s" (takeBaseName doc.iFilePath))
+ ( \doc ->
+ R.replaceDocument
+ (printf "tag %s" (takeBaseName doc.iFilePath))
+ doc
)
- docs
+ (map (addTags tags . removeTags untags) docs)
-printTags :: Document -> IO ()
+printTags :: D.Document -> IO ()
printTags doc =
mapM_
( \tag ->
@@ -380,76 +383,36 @@ printTags doc =
Nothing -> printf "@%s\n" (G.tagKey tag)
Just tagValue -> printf "@%s %s\n" (G.tagKey tag) tagValue
)
- ( filter
- ( \tag ->
- -- TODO Handle tags hidden by default
- --
- -- - Add `list-default-hide-tags` to `Settings`
- -- - Add `--show-tags`, `--hide-tags` to `List`
- not ("paperless-ngx." `T.isPrefixOf` G.tagKey tag)
- && (G.tagKey tag /= "content")
- && (G.tagKey tag /= "importedFrom")
- && (G.tagKey tag /= "language")
- )
+ ( filter arbitrarilySelectTag
. S.toList
$ doc.index.tags `S.union` doc.index.internalTags
)
+arbitrarilySelectTag :: G.Tag -> Bool
+arbitrarilySelectTag tag =
+ -- TODO Handle tags hidden by default
+ --
+ -- - Add `list-default-hide-tags` to `Settings`
+ -- - Add `--show-tags`, `--hide-tags` to `List`
+ not ("paperless-ngx." `T.isPrefixOf` G.tagKey tag)
+ && (G.tagKey tag /= "content")
+ && (G.tagKey tag /= "importedFrom")
+ && (G.tagKey tag /= "language")
+
doRedoIf :: [G.Filter] -> Bool -> IO ()
doRedoIf filters redo =
when redo do
parMapM_ doRedo
. filter (G.applyFilters filters . (.index.tags))
- =<< getDocuments
+ =<< D.getDocuments
where
doRedo doc = do
content <- ocr doc.index.language doc.oFilePath
- withGit do
- J.encodeFile doc.iFilePath doc.index {content = content}
- commitAll [doc.iFilePath] (printf "redo %s" (takeBaseName doc.iFilePath))
-
-data Document = Document
- { iFileName :: String,
- index :: Index
- }
- deriving (Show)
-
-instance HasField "oFilePath" Document FilePath where
- getField doc = "originals" </> takeBaseName doc.iFileName <.> "pdf"
-
-instance HasField "iFilePath" Document FilePath where
- getField doc = "index" </> doc.iFileName
-
-hasWord :: T.Text -> Document -> Bool
-hasWord word doc = S.member word doc.index.originalWords
-
-hasTag :: G.Tag -> Document -> Bool
-hasTag tag doc = S.member tag doc.index.tags
-
-hasWordAndTag :: T.Text -> G.Tag -> Document -> Bool
-hasWordAndTag word tag doc = hasTag tag doc && hasWord word doc
-
-tagValues :: [Document] -> M.Map T.Text (S.Set T.Text)
-tagValues docs =
- M.unionsWith S.union $
- mapMaybe
- ( \tag ->
- M.singleton (G.tagKey tag) . S.singleton <$> (G.tagValue tag)
- )
- (S.toList (S.unions (map (.index.tags) docs)))
+ let doc' = doc {D.index = doc.index {D.content = content}}
+ R.replaceDocument (printf "redo %s" (takeBaseName doc.iFilePath)) doc'
-getDocuments :: IO [Document]
-getDocuments =
- parMapM readDocument
- =<< sort <$> listDirectory "index"
-
-readDocument :: FilePath -> IO Document
-readDocument iFileName =
- Document iFileName
- <$> decodeFile @Index ("index" </> iFileName)
-
-processDocumentsInteractively :: S.Settings -> [Document] -> [Document] -> IO [Document]
-processDocumentsInteractively settings allDocs docs =
+processDocumentsInteractively :: S.Settings -> C.ProbabilityCache -> [D.Document] -> IO [D.Document]
+processDocumentsInteractively settings probabilityCache docs = do
mapM (uncurry processDocumentInteractively) (zip [1 :: Int ..] docs)
where
numDocs = length docs
@@ -478,13 +441,13 @@ processDocumentsInteractively settings allDocs docs =
printf "%s\n" (takeBaseName doc.iFileName)
printf "%s\n" doc.index.content
processDocumentInteractively n doc
- "p" -> tagDocumentInteractively settings allDocs doc
+ "p" -> tagDocumentInteractively settings probabilityCache doc
"s" -> pure doc
"v" -> do
viewDocuments [doc]
processDocumentInteractively n doc
-viewDocuments :: [Document] -> IO ()
+viewDocuments :: [D.Document] -> IO ()
viewDocuments docs =
sh_
( "zathura "
@@ -494,7 +457,7 @@ viewDocuments docs =
)
)
-editDocuments :: [Document] -> IO ()
+editDocuments :: [D.Document] -> IO ()
editDocuments docs =
withSystemTempDirectory "apaperless" $ \tmp -> do
let fp doc = tmp </> takeBaseName doc.iFileName <.> "txt"
@@ -513,63 +476,49 @@ editDocuments docs =
mapM_
( \doc -> do
content <- T.readFile (fp doc)
- withGit do
- J.encodeFile doc.iFilePath doc.index {content = content}
- commitAll [doc.iFilePath] (printf "edit %s" (takeBaseName doc.iFilePath))
+ let doc' = doc {D.index = doc.index {D.content = content}}
+ R.replaceDocument (printf "edit %s" (takeBaseName doc.iFilePath)) doc'
pure content
)
docs
-type ProbabilityMap = [(T.Text, G.Tag, Double)]
-
--- TODO Consider words that contribute to NOT adding a tag
---
--- If there is a document that should, say, not have a `@correspondent`, we should score words that contribute to that fact as well.
---
--- @topic probability-map
-probabilityMap :: [Document] -> ProbabilityMap
-probabilityMap allDocs =
- [ let docsWithWord = filter (hasWord word) allDocs
- docsWithTag = filter (hasTag tag) allDocs
- docsWithWordAndTag = filter (hasWordAndTag word tag) allDocs
- p =
- fi (length docsWithWordAndTag)
- / fi
- ( length docsWithWord
- + length docsWithTag
- - length docsWithWordAndTag
- )
- in (word, tag, p)
- | word <- S.toList allWords,
- tag <- S.toList allTags
- ]
- where
- allTags = foldl S.union S.empty (map (.index.tags) allDocs)
- allWords = foldl S.union S.empty (map (.index.originalWords) allDocs)
- fi = fromIntegral @Int @Double
-
-suggestTags :: S.Settings -> [Document] -> Document -> IO [(G.Tag, [G.Tag])]
-suggestTags settings allDocs doc = do
- forM settings.suggestedTags $ \suggestedTag -> do
+suggestTags :: S.Settings -> C.ProbabilityCache -> D.Document -> [(G.Tag, [G.Tag])]
+suggestTags settings probabilityCache doc = do
+ flip map settings.suggestedTags $ \suggestedTag ->
case suggestedTag of
- S.SuggestTagByRE tagName pattern -> do
+ S.SuggestTagByRE tagName pattern ->
let tagValues =
- nub . catMaybes . map R.matchedText . R.allMatches $
- doc.index.content
- R.*=~ pattern
- pure (G.tag tagName (Just ""), map (G.tag tagName . Just) tagValues)
- S.SuggestTagByTags tagName -> do
+ nub
+ . mapMaybe
+ ( -- XXX Whys is this so complicated?
+ \match -> do
+ captures <- snd <$> R.matchCaptures match
+ pure
+ ( foldl
+ ( \template (captureName, captureOrdinal) ->
+ T.replace
+ ("${" <> R.getCaptureName captureName <> "}")
+ (R.capturedText (captures !! (fromEnum captureOrdinal - 1)))
+ template
+ )
+ (R.getTemplate pattern)
+ (HM.toList $ R.captureNames match)
+ )
+ )
+ $ R.allMatches (doc.index.content R.*=~ (R.getSearch pattern))
+ in (G.tag tagName (Just ""), map (G.tag tagName . Just) tagValues)
+ S.SuggestTagByTags tagName ->
let tagValues =
- -- TODO Cache `probabilityMap`
+ -- TODO Cache `probabilityCache`
--
-- @topic probability-map
- probabilityMap allDocs
- & filter
- ( \(word, tag, _) ->
- G.tagKey tag == tagName && hasWord word doc
+ probabilityCache.probabilityMap
+ & M.filterWithKey
+ ( \(word, tag) _ ->
+ G.tagKey tag == tagName && D.hasWord word doc
)
- & foldl'
- ( \scorePerTagValue (_, tag, p) ->
+ & M.foldlWithKey'
+ ( \scorePerTagValue (_, tag) p ->
M.insertWith
(+)
(G.tagValue tag)
@@ -580,7 +529,7 @@ suggestTags settings allDocs doc = do
& M.toList
& sortBy (comparing (negate . snd))
& map fst
- pure (G.tag tagName (Just ""), map (G.tag tagName) tagValues)
+ in (G.tag tagName (Just ""), map (G.tag tagName) tagValues)
autoApplySuggestedTags :: [(G.Tag, [G.Tag])] -> [G.Tag]
autoApplySuggestedTags =
@@ -591,28 +540,26 @@ autoApplySuggestedTags =
else Just (head suggestedTags)
)
-processDocuments :: S.Settings -> [Document] -> [Document] -> IO [Document]
-processDocuments settings allDocs docs =
+processDocuments :: S.Settings -> C.ProbabilityCache -> [D.Document] -> IO [D.Document]
+processDocuments settings probabilityCache docs =
mapM processDocument docs
where
processDocument doc = do
- tags <-
- S.fromList . autoApplySuggestedTags
- <$> suggestTags settings allDocs doc
+ let tags =
+ S.fromList . autoApplySuggestedTags $
+ suggestTags settings probabilityCache doc
let doc' =
doc
- { index =
+ { D.index =
doc.index
- { tags = doc.index.tags `S.union` tags,
- todo = False
+ { D.tags = doc.index.tags `S.union` tags,
+ D.todo = False
}
}
- withGit do
- J.encodeFile doc.iFilePath doc'.index
- commitAll [doc.iFilePath] (printf "process %s (auto)" doc.iFilePath)
+ R.replaceDocument (printf "process %s (auto)" doc.iFilePath) doc'
pure doc'
-applyTags :: [Either T.Text G.Tag] -> Document -> Document
+applyTags :: [Either T.Text G.Tag] -> D.Document -> D.Document
applyTags tags' doc = do
addTags tags (removeTags untags doc)
where
@@ -625,43 +572,43 @@ data AddTagException = AddTagException G.Tag
instance Exception AddTagException
-addTags :: [G.Tag] -> Document -> Document
+addTags :: [G.Tag] -> D.Document -> D.Document
addTags =
flip (foldl (flip addTag))
-addTag :: G.Tag -> Document -> Document
+addTag :: G.Tag -> D.Document -> D.Document
addTag tag doc =
if
| G.tagKey tag == "todo" ->
- doc {index = doc.index {todo = True}}
+ doc {D.index = doc.index {D.todo = True}}
| G.tagKey tag == "addedAt" ->
maybe
(throw (AddTagException tag))
( \addedAt ->
- doc {index = doc.index {addedAt = addedAt}}
+ doc {D.index = doc.index {D.addedAt = addedAt}}
)
(iso8601ParseM . T.unpack =<< (G.tagValue tag))
| G.tagKey tag == "modifiedAt" ->
maybe
(throw (AddTagException tag))
( \modifiedAt ->
- doc {index = doc.index {modifiedAt = Just modifiedAt}}
+ doc {D.index = doc.index {D.modifiedAt = Just modifiedAt}}
)
(iso8601ParseM . T.unpack =<< (G.tagValue tag))
| G.tagKey tag == "content" ->
maybe
(throw (AddTagException tag))
( \content ->
- doc {index = doc.index {content = content}}
+ doc {D.index = doc.index {D.content = content}}
)
(G.tagValue tag)
| G.tagKey tag == "language" ->
throw (AddTagException tag)
| otherwise ->
doc
- { index =
+ { D.index =
doc.index
- { tags = G.replace tag doc.index.tags
+ { D.tags = G.replace tag doc.index.tags
}
}
@@ -670,15 +617,15 @@ data RemoveTagException = RemoveTagException G.Tag
instance Exception RemoveTagException
-removeTags :: [G.Tag] -> Document -> Document
+removeTags :: [G.Tag] -> D.Document -> D.Document
removeTags =
flip (foldl (flip removeTag))
-removeTag :: G.Tag -> Document -> Document
+removeTag :: G.Tag -> D.Document -> D.Document
removeTag tag doc =
if
| G.tagKey tag == "todo" ->
- doc {index = doc.index {todo = False}}
+ doc {D.index = doc.index {D.todo = False}}
| G.tagKey tag == "addedAt" ->
throw (RemoveTagException tag)
| G.tagKey tag == "modifiedAt" ->
@@ -689,9 +636,9 @@ removeTag tag doc =
throw (RemoveTagException tag)
| otherwise ->
doc
- { index =
+ { D.index =
doc.index
- { tags =
+ { D.tags =
maybe
(G.deleteAll (G.tagKey tag))
(\_ -> G.delete tag)
@@ -700,18 +647,16 @@ removeTag tag doc =
}
}
-tagDocumentInteractively :: S.Settings -> [Document] -> Document -> IO Document
-tagDocumentInteractively settings allDocs doc = do
- suggestedTags <- suggestTags settings allDocs doc
+tagDocumentInteractively :: S.Settings -> C.ProbabilityCache -> D.Document -> IO D.Document
+tagDocumentInteractively settings probabilityCache doc = do
+ let suggestedTags = suggestTags settings probabilityCache doc
tags <- mapM (uncurry tagDocumentInteractively') suggestedTags
let doc' =
(applyTags tags doc)
- { index = doc.index {todo = False}
+ { D.index = doc.index {D.todo = False}
}
- withGit do
- J.encodeFile doc.iFilePath doc'.index
- commitAll [doc.iFilePath] (printf "process %s (interactive)" doc.iFilePath)
- pure doc'
+ R.replaceDocument (printf "process %s (interactive)" doc.iFilePath) doc'
+ pure doc'
where
tagDocumentInteractively' :: G.Tag -> [G.Tag] -> IO (Either T.Text G.Tag)
tagDocumentInteractively' tag tags
@@ -770,30 +715,19 @@ consume1 language force keep filePath = do
then ocr language filePath
else pure content'
addedAt <- getCurrentTime
- withGit do
- J.encodeFile
- iFilePath
- Index
- { tags = S.empty,
- todo = True,
- modifiedAt = Nothing,
- ..
- }
- if keep
- then copyFile filePath oFilePath
- else renameFile filePath oFilePath
- commitAll
- [iFilePath, oFilePath]
- (printf "add %s" (takeFileName filePath))
- pure (takeBaseName iFilePath)
-
-withGit :: IO a -> IO a
-withGit = withLockFile def ".gitlock"
+ let doc =
+ D.Document (fKey <.> "json") $
+ D.Index
+ { tags = S.empty,
+ todo = True,
+ modifiedAt = Nothing,
+ ..
+ }
+ R.commitDocument (printf "add %s" (takeFileName filePath)) filePath doc
+ when (not keep) do
+ removeFile filePath
-commitAll :: [FilePath] -> String -> IO ()
-commitAll fps m = do
- sh_ (">/dev/null git add -- " ++ intercalate " " (map (printf "'%s'") fps))
- sh_ (printf ">/dev/null git commit -m '%s' || :" m)
+ pure (takeBaseName iFilePath)
data DecodeException = DecodeException FilePath String
deriving (Show)
@@ -839,49 +773,6 @@ ocr1 language tmp input =
-- XXX `--oem 1` seems to be unavailable
<$> sh (printf "tesseract '%s' - -l '%s' --oem 3 --psm 1" (tmp </> input) language)
-data Index = Index
- { content :: T.Text,
- tags :: S.Set G.Tag,
- addedAt :: UTCTime,
- modifiedAt :: Maybe UTCTime,
- todo :: Bool,
- language :: String
- }
- deriving (Show, Generic, Eq)
-
-instance J.ToJSON Index
-
-instance J.FromJSON Index
-
-instance HasField "shortText" Index T.Text where
- getField =
- (T.unlines . take 10 . T.lines)
- . (.content)
-
-instance HasField "internalTags" Index (S.Set G.Tag) where
- getField index =
- index.tags
- `S.union` internalTags index
-
--- TODO Cache `originalWords`
---
--- @related cache-probabilitymap
-instance HasField "originalWords" Index (S.Set T.Text) where
- getField index =
- S.fromList (T.words index.content)
-
-internalTags :: Index -> S.Set G.Tag
-internalTags index =
- S.fromList
- ( concat
- [ [ G.tag "addedAt" (Just (T.pack (iso8601Show index.addedAt))),
- G.tag "language" (Just (T.pack index.language))
- ],
- maybeToList (G.tag "modifiedAt" . Just . T.pack . iso8601Show <$> index.modifiedAt),
- if index.todo then [G.tag "todo" Nothing] else []
- ]
- )
-
data PdfInfo = PdfInfo
{ numPages :: Int,
pageSize :: (Double, Double)
diff --git a/app/ProbabilityMap.hs b/app/ProbabilityMap.hs
new file mode 100644
index 0000000..9a700d7
--- /dev/null
+++ b/app/ProbabilityMap.hs
@@ -0,0 +1,230 @@
+{-# LANGUAGE DeriveAnyClass #-}
+{-# LANGUAGE DuplicateRecordFields #-}
+{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE OverloadedRecordDot #-}
+{-# LANGUAGE RecordWildCards #-}
+{-# LANGUAGE StrictData #-}
+{-# LANGUAGE ViewPatterns #-}
+{-# LANGUAGE NoFieldSelectors #-}
+{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
+
+module ProbabilityMap
+ ( fileName,
+ ProbabilityCache (..),
+ readProbabilityCache,
+ writeProbabilityCache,
+ modifyProbabilityCache,
+ fromDocuments,
+ addDocument,
+ deleteDocument,
+ replaceDocument,
+ )
+where
+
+import Control.Applicative (liftA2)
+import Control.Arrow ((***))
+import Control.DeepSeq (NFData (..))
+import Control.Exception (SomeException, try)
+import Control.Parallel.Strategies
+import Data.Binary qualified as B
+import Data.List (foldl')
+import Data.Map qualified as M
+import Data.Maybe (fromMaybe)
+import Data.Set qualified as S
+import Data.Text qualified as T
+import Data.Vector qualified as V
+import Data.Vector.Binary ()
+import Document qualified as D
+import GHC.Conc (numCapabilities)
+import GHC.Generics (Generic)
+import Tag qualified as G
+
+data ProbabilityCache = ProbabilityCache
+ { docsPerWord :: M.Map T.Text Int,
+ docsPerTag :: M.Map G.Tag Int,
+ docsPerWordAndTag :: M.Map (T.Text, G.Tag) Int,
+ probabilityMap :: M.Map (T.Text, G.Tag) Double
+ }
+ deriving (Show, Generic)
+
+data ProbabilityCache' = ProbabilityCache'
+ { docsPerWord :: M.Map T.Text Int,
+ docsPerTag :: M.Map G.Tag Int,
+ docsPerWordAndTag :: M.Map (T.Text, G.Tag) Int
+ }
+ deriving (Show, Generic)
+
+instance NFData ProbabilityCache' where
+ rnf (ProbabilityCache' {..}) =
+ rnf docsPerWord `seq`
+ rnf docsPerTag `seq`
+ rnf docsPerWordAndTag
+
+fromProbabilityCache' :: ProbabilityCache' -> ProbabilityCache
+fromProbabilityCache' probabilityCache'@(ProbabilityCache' {..}) =
+ ProbabilityCache {probabilityMap = probabilityMap probabilityCache', ..}
+
+toProbabilityCache' :: ProbabilityCache -> ProbabilityCache'
+toProbabilityCache' (ProbabilityCache {..}) =
+ ProbabilityCache' {..}
+
+instance B.Binary ProbabilityCache where
+ get = fromCacheRep <$> B.get
+ put = B.put . toCacheRep
+
+instance NFData ProbabilityCache where
+ rnf (ProbabilityCache {..}) =
+ rnf docsPerWord `seq`
+ rnf docsPerTag `seq`
+ rnf docsPerWordAndTag `seq`
+ rnf probabilityMap
+
+data CacheRep = CacheRep
+ { allWords :: V.Vector T.Text,
+ allTags :: V.Vector G.Tag,
+ docsPerWord :: M.Map Int Int,
+ docsPerTag :: M.Map Int Int,
+ docsPerWordAndTag :: M.Map (Int, Int) Int,
+ probabilityMap :: M.Map (Int, Int) Double
+ }
+ deriving (Show, Generic, B.Binary)
+
+instance NFData CacheRep where
+ rnf (CacheRep {..}) =
+ rnf allWords `seq`
+ rnf allTags `seq`
+ rnf docsPerWord `seq`
+ rnf docsPerTag `seq`
+ rnf docsPerWordAndTag `seq`
+ rnf probabilityMap
+
+fromCacheRep :: CacheRep -> ProbabilityCache
+fromCacheRep (CacheRep {..}) =
+ ProbabilityCache
+ { docsPerWord = M.mapKeys fromWord docsPerWord,
+ docsPerTag = M.mapKeys fromTag docsPerTag,
+ docsPerWordAndTag = M.mapKeys fromWordAndTag docsPerWordAndTag,
+ probabilityMap = M.mapKeys fromWordAndTag probabilityMap
+ }
+ where
+ fromWord = (allWords V.!)
+ fromTag = (allTags V.!)
+ fromWordAndTag = fromWord *** fromTag
+
+toCacheRep :: ProbabilityCache -> CacheRep
+toCacheRep (ProbabilityCache {..}) =
+ CacheRep
+ { allWords = V.fromList allWords',
+ allTags = V.fromList allTags',
+ docsPerWord = M.mapKeys toWord docsPerWord,
+ docsPerTag = M.mapKeys toTag docsPerTag,
+ docsPerWordAndTag = M.mapKeys toWordAndTag docsPerWordAndTag,
+ probabilityMap = M.mapKeys toWordAndTag probabilityMap
+ }
+ where
+ allWords' = M.keys docsPerWord
+ allTags' = M.keys docsPerTag
+ allWordsMap = M.fromList $ zip allWords' [0 ..]
+ allTagsMap = M.fromList $ zip allTags' [0 ..]
+ toWord = (allWordsMap M.!)
+ toTag = (allTagsMap M.!)
+ toWordAndTag = toWord *** toTag
+
+fileName :: FilePath
+fileName = "probabilityCache"
+
+readProbabilityCache :: IO ProbabilityCache
+readProbabilityCache =
+ try (B.decodeFile fileName) >>= \case
+ Left (_ :: SomeException) -> fromDocuments <$> D.getDocuments
+ Right probabilityCache -> pure probabilityCache
+
+writeProbabilityCache :: ProbabilityCache -> IO ()
+writeProbabilityCache probabilityCache =
+ B.encodeFile fileName probabilityCache
+
+modifyProbabilityCache :: (ProbabilityCache -> ProbabilityCache) -> IO ProbabilityCache
+modifyProbabilityCache f =
+ liftA2 (>>) (writeProbabilityCache . f) pure =<< readProbabilityCache
+
+fromDocuments :: [D.Document] -> ProbabilityCache
+fromDocuments allDocs =
+ fromProbabilityCache' $
+ foldl' (flip addDocument') (ProbabilityCache' M.empty M.empty M.empty) allDocs
+
+probabilityMap :: ProbabilityCache' -> M.Map (T.Text, G.Tag) Double
+probabilityMap (ProbabilityCache' {..}) =
+ M.unions
+ . withStrategy (parList rdeepseq)
+ . map
+ ( M.fromList
+ . filter (\(_, p) -> p > 0)
+ . map (\k@(word, tag) -> (k, probability word tag))
+ )
+ $ streamsOf
+ numCapabilities
+ [ (word, tag)
+ | word <- allWords,
+ tag <- allTags
+ ]
+ where
+ probability word tag =
+ let docsWithWord = docsPerWord M.! word
+ docsWithTag = docsPerTag M.! tag
+ docsWithWordAndTag = fromMaybe 0 $ M.lookup (word, tag) docsPerWordAndTag
+ in fi docsWithWordAndTag / fi (docsWithWord + docsWithTag - docsWithWordAndTag)
+
+ allWords = M.keys docsPerWord
+ allTags = M.keys docsPerTag
+ fi = fromIntegral @Int @Double
+
+streamsOf :: Int -> [a] -> [[a]]
+streamsOf 1 xs = [xs]
+streamsOf n xs
+ | n > 0 = [everyN k xs | k <- [0 .. n - 1]]
+ | otherwise = []
+ where
+ everyN k xs = map snd $ filter ((== k) . (`mod` n) . fst) $ zip [0 ..] xs
+
+addDocument :: D.Document -> ProbabilityCache -> ProbabilityCache
+addDocument doc probabilityCache =
+ fromProbabilityCache' $ addDocument' doc (toProbabilityCache' probabilityCache)
+
+deleteDocument :: D.Document -> ProbabilityCache -> ProbabilityCache
+deleteDocument doc probabilityCache =
+ fromProbabilityCache' $ deleteDocument' doc (toProbabilityCache' probabilityCache)
+
+replaceDocument :: D.Document -> D.Document -> ProbabilityCache -> ProbabilityCache
+replaceDocument oldDoc newDoc probabilityCache =
+ fromProbabilityCache' $
+ (addDocument' newDoc . deleteDocument' oldDoc) (toProbabilityCache' probabilityCache)
+
+addDocument' :: D.Document -> ProbabilityCache' -> ProbabilityCache'
+addDocument' doc probabilityCache =
+ combining (+) probabilityCache (fromDocument doc)
+
+deleteDocument' :: D.Document -> ProbabilityCache' -> ProbabilityCache'
+deleteDocument' doc probabilityCache =
+ combining (-) probabilityCache (fromDocument doc)
+
+combining :: (Int -> Int -> Int) -> ProbabilityCache' -> ProbabilityCache' -> ProbabilityCache'
+combining f probabilityCache probabilityCache' =
+ ProbabilityCache' {..}
+ where
+ docsPerWord = M.unionWith f probabilityCache.docsPerWord probabilityCache'.docsPerWord
+ docsPerTag = M.unionWith f probabilityCache.docsPerTag probabilityCache'.docsPerTag
+ docsPerWordAndTag = M.unionWith f probabilityCache.docsPerWordAndTag probabilityCache'.docsPerWordAndTag
+
+fromDocument :: D.Document -> ProbabilityCache'
+fromDocument doc =
+ let words = S.toList doc.index.originalWords
+ tags = S.toList doc.index.tags
+ wordAndTags = [(word, tag) | word <- words, tag <- tags]
+ in ProbabilityCache'
+ { docsPerWord = docsPer words,
+ docsPerTag = docsPer tags,
+ docsPerWordAndTag = docsPer wordAndTags
+ }
+ where
+ docsPer :: Ord k => [k] -> M.Map k Int
+ docsPer = M.fromList . map (flip (,) 1)
diff --git a/app/Settings.hs b/app/Settings.hs
index 08b40bb..43f721c 100644
--- a/app/Settings.hs
+++ b/app/Settings.hs
@@ -16,6 +16,7 @@ import Data.Aeson qualified as A
import Data.Function (on)
import Data.List (nub)
import Data.Text qualified as T
+import Data.Vector qualified as V
import Data.Yaml (decodeFileThrow, encodeFile)
import GHC.Generics (Generic)
import System.Directory (doesFileExist)
@@ -47,7 +48,9 @@ instance Monoid Settings where
-- - perform minor corrections on OCR, ie. parse `0 1.01.1970`
--
-- @topic suggested-tags
- SuggestTagByRE "createdAt" [R.re|[0-9]{2}\.[0-9]{2}\.[0-9]{4}|],
+ SuggestTagByRE
+ "createdAt"
+ [R.ed|${d}([0-9]{2})\.${m}([0-9]{2})\.${y}([0-9]{4})///${y}-${m}-${d}|],
SuggestTagByTags "correspondent"
],
defaultLanguage = "deu+eng"
@@ -58,7 +61,7 @@ instance A.FromJSON Settings
instance A.ToJSON Settings
data SuggestedTag
- = SuggestTagByRE T.Text R.RE
+ = SuggestTagByRE T.Text (R.SearchReplace R.RE T.Text)
| SuggestTagByTags T.Text
deriving (Show, Generic, Eq)
@@ -68,6 +71,10 @@ instance Show R.RE where
instance Eq R.RE where
(==) = (==) `on` show
+instance (Eq re, Eq s) => Eq (R.SearchReplace re s) where
+ a == b =
+ ((==) `on` R.getSearch) a b && (((==) `on` R.getTemplate) a b)
+
instance A.FromJSON SuggestedTag
instance A.ToJSON SuggestedTag
@@ -78,6 +85,20 @@ instance A.FromJSON R.RE where
instance A.ToJSON R.RE where
toJSON = A.toJSON . R.reSource
+instance (A.FromJSON re, A.FromJSON s) => A.FromJSON (R.SearchReplace re s) where
+ parseJSON =
+ A.withArray
+ "regular search/replace expression (POSIX)"
+ ( \a -> do
+ re <- a `V.indexM` 0
+ s <- a `V.indexM` 1
+ R.SearchReplace <$> A.parseJSON re <*> A.parseJSON s
+ )
+
+instance (A.ToJSON re, A.ToJSON s) => A.ToJSON (R.SearchReplace re s) where
+ toJSON (R.SearchReplace re s) =
+ A.Array (V.fromList [A.toJSON re, A.toJSON s])
+
readSettings :: IO Settings
readSettings =
fmap mconcat
diff --git a/app/Store.hs b/app/Store.hs
new file mode 100644
index 0000000..7a4637f
--- /dev/null
+++ b/app/Store.hs
@@ -0,0 +1,60 @@
+{-# LANGUAGE BlockArguments #-}
+{-# LANGUAGE OverloadedRecordDot #-}
+
+module Store
+ ( commitDocument,
+ replaceDocument,
+ )
+where
+
+import Control.Exception (Exception, throwIO)
+import Data.Aeson qualified as J
+import Data.ByteString.Lazy qualified as LB
+import Data.Default (Default (def))
+import Data.List (intercalate)
+import Data.String (IsString (fromString))
+import Document qualified as D
+import ProbabilityMap qualified as C
+import System.Directory (copyFile)
+import System.Exit (ExitCode (ExitFailure, ExitSuccess))
+import System.IO.LockFile (withLockFile)
+import System.Process.Typed (readProcessStderr)
+import Text.Printf (printf)
+
+replaceDocument :: String -> D.Document -> IO ()
+replaceDocument message doc = do
+ withGit do
+ J.encodeFile doc.iFilePath doc.index
+ oldDoc <- D.readDocument doc.iFileName
+ _ <- C.modifyProbabilityCache (C.replaceDocument oldDoc doc)
+ commitAll [doc.iFilePath, C.fileName] message
+
+commitDocument :: String -> FilePath -> D.Document -> IO ()
+commitDocument message original doc = do
+ withGit do
+ copyFile original doc.oFilePath
+ J.encodeFile doc.iFilePath doc.index
+ _ <- C.modifyProbabilityCache (C.addDocument doc)
+ commitAll [doc.iFilePath, doc.oFilePath, C.fileName] message
+
+withGit :: IO a -> IO a
+withGit = withLockFile def ".gitlock"
+
+commitAll :: [FilePath] -> String -> IO ()
+commitAll fps m = do
+ sh_ (">/dev/null git add -- " ++ intercalate " " (map (printf "'%s'") fps))
+ sh_ (printf ">/dev/null git commit -m '%s' || :" m)
+
+-- TODO Refacotor library `sh`
+sh_ :: String -> IO ()
+sh_ cmd = do
+ -- printf "+ %s\n" cmd
+ (exitCode, err) <- readProcessStderr (fromString cmd)
+ case exitCode of
+ ExitSuccess -> return ()
+ ExitFailure exitCode' -> throwIO $ ProcessException exitCode' err
+
+data ProcessException = ProcessException Int LB.ByteString
+ deriving (Show)
+
+instance Exception ProcessException
diff --git a/tags/src/Tag.hs b/tags/src/Tag.hs
index f7f3398..882f1de 100644
--- a/tags/src/Tag.hs
+++ b/tags/src/Tag.hs
@@ -51,6 +51,7 @@ module Tag
where
import Control.Applicative ((<|>))
+import Control.DeepSeq (NFData (rnf))
import Data.Aeson qualified as J
import Data.Attoparsec.Text qualified as A
import Data.Binary (Binary)
@@ -66,6 +67,9 @@ import TypedValue (cast, castDef)
data Tag = Tag T.Text (Maybe T.Text)
deriving (Show, Generic, Binary, Eq, Ord)
+instance NFData Tag where
+ rnf (Tag k v) = rnf k `seq` rnf v
+
tag :: T.Text -> Maybe T.Text -> Tag
tag = Tag
diff --git a/tags/tags.cabal b/tags/tags.cabal
index 0149e74..c93aedd 100644
--- a/tags/tags.cabal
+++ b/tags/tags.cabal
@@ -26,7 +26,8 @@ library
containers,
regex,
text,
- time
+ time,
+ deepseq
hs-source-dirs: src
default-language: GHC2021
default-extensions: