summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLibravatar Alexander Foremny <aforemny@posteo.de>2024-01-11 03:27:17 +0100
committerLibravatar Alexander Foremny <aforemny@posteo.de>2024-01-11 08:12:02 +0100
commit75ace0b26691848b9a16a9a0ec1880110e8262b8 (patch)
treebefda6536685ebc0472f5b0273ad25fdf8eb0ae4
parent7bdf16be84b368655ce2ee3d9ab6bf185dfb59b5 (diff)
feat: add `check` cmd
-rw-r--r--apaperless.cabal1
-rw-r--r--app/Document.hs22
-rw-r--r--app/Main.hs47
-rw-r--r--app/Parallel.hs22
-rw-r--r--app/ProbabilityMap.hs11
5 files changed, 84 insertions, 19 deletions
diff --git a/apaperless.cabal b/apaperless.cabal
index 1217b15..8e26457 100644
--- a/apaperless.cabal
+++ b/apaperless.cabal
@@ -21,6 +21,7 @@ executable apaperless
main-is: Main.hs
other-modules:
Document
+ Parallel
ProbabilityMap
Prompt
Settings
diff --git a/app/Document.hs b/app/Document.hs
index ac8a73b..fa9f33f 100644
--- a/app/Document.hs
+++ b/app/Document.hs
@@ -1,6 +1,7 @@
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE OverloadedRecordDot #-}
{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE StrictData #-}
module Document
@@ -15,7 +16,7 @@ module Document
)
where
-import Control.Concurrent.ParallelIO.Local (parallel, withPool)
+import Control.DeepSeq (NFData (rnf))
import Control.Exception (Exception, throwIO)
import Data.Aeson qualified as J
import Data.ByteString.Lazy qualified as LB
@@ -26,9 +27,9 @@ import Data.Set qualified as S
import Data.Text qualified as T
import Data.Time.Clock (UTCTime)
import Data.Time.Format.ISO8601 (iso8601Show)
-import GHC.Conc (getNumProcessors)
import GHC.Generics (Generic)
import GHC.Records (HasField (..))
+import Parallel (parMapM)
import System.Directory (listDirectory)
import System.FilePath (takeBaseName, (<.>), (</>))
import Tag qualified as G
@@ -42,6 +43,9 @@ data Document = Document
}
deriving (Show)
+instance NFData Document where
+ rnf (Document {..}) = rnf iFileName `seq` rnf index
+
instance HasField "oFilePath" Document FilePath where
getField doc = "originals" </> takeBaseName doc.iFileName <.> "pdf"
@@ -86,6 +90,15 @@ data Index = Index
}
deriving (Show, Generic, Eq)
+instance NFData Index where
+ rnf (Index {..}) =
+ rnf content `seq`
+ rnf tags `seq`
+ rnf addedAt `seq`
+ rnf modifiedAt `seq`
+ rnf todo `seq`
+ rnf language
+
instance J.ToJSON Index
instance J.FromJSON Index
@@ -118,11 +131,6 @@ internalTags index =
]
)
-parMapM :: (a -> IO b) -> [a] -> IO [b]
-parMapM f xs = do
- n <- getNumProcessors
- withPool n $ \pool -> parallel pool (map f xs)
-
data DecodeException = DecodeException FilePath String
deriving (Show)
diff --git a/app/Main.hs b/app/Main.hs
index 936f053..ebf94c9 100644
--- a/app/Main.hs
+++ b/app/Main.hs
@@ -18,6 +18,7 @@ import Control.Arrow (first, second)
import Control.Concurrent.ParallelIO.Local (parallel, withPool)
import Control.Exception (Exception, throw, throwIO)
import Control.Monad (unless, when)
+import Control.Parallel.Strategies (parList, rdeepseq, withStrategy)
import Data.Aeson qualified as J
import Data.Attoparsec.Text qualified as A
import Data.ByteString.Lazy qualified as LB
@@ -28,7 +29,7 @@ import Data.HashMap.Internal qualified as HM
import Data.List
import Data.List.NonEmpty qualified as N
import Data.Map qualified as M
-import Data.Maybe (fromMaybe, mapMaybe)
+import Data.Maybe (catMaybes, fromMaybe, mapMaybe)
import Data.Ord (comparing)
import Data.Set qualified as S
import Data.String (IsString (fromString))
@@ -39,8 +40,9 @@ import Data.Time.Clock (getCurrentTime)
import Data.Time.Format.ISO8601 (iso8601ParseM)
import Debug.Trace
import Document qualified as D
-import GHC.Conc (getNumProcessors)
+import GHC.Conc (getNumProcessors, numCapabilities)
import Options.Applicative qualified as O
+import Parallel (streamsOf)
import ProbabilityMap qualified as C
import Prompt qualified as P
import Settings qualified as S
@@ -89,6 +91,7 @@ data Cmd
tags :: [G.Tag],
untags :: [G.Tag]
}
+ | Check
args :: O.Parser Args
args =
@@ -110,7 +113,9 @@ cmd =
O.command "topwords" . O.info topWordsCmd $
O.progDesc "View probability map per tag",
O.command "modify" . O.info modifyCmd $
- O.progDesc "Modify document(s)"
+ O.progDesc "Modify document(s)",
+ O.command "check" . O.info checkCmd $
+ O.progDesc "Check document tags"
]
consumeCmd :: O.Parser Cmd
@@ -157,6 +162,10 @@ modifyCmd =
<*> tagsArg
<*> untagsArg
+checkCmd :: O.Parser Cmd
+checkCmd =
+ pure Check
+
inputsArg :: O.Parser [FilePath]
inputsArg =
O.many
@@ -374,6 +383,38 @@ main = do
doc
)
(map (addTags tags . removeTags untags) docs)
+ Args {cmd = Check} -> do
+ allDocs <- D.getDocuments
+ probabilityCache <- C.readProbabilityCache
+ mapM_
+ ( \(doc, tags, tags') -> do
+ printf "%s\n" (takeBaseName doc.iFileName)
+ printTags doc
+ mapM_
+ (\tag -> printf ("- %s\n") (show tag))
+ (tags `S.difference` tags')
+ mapM_
+ (\tag -> printf ("+ %s\n") (show tag))
+ (tags' `S.difference` tags)
+ )
+ ( catMaybes
+ ( concat
+ ( withStrategy
+ (parList rdeepseq)
+ ( streamsOf
+ numCapabilities
+ ( map
+ ( \doc ->
+ let tags = S.filter arbitrarilySelectTag doc.index.tags
+ tags' = S.fromList $ autoApplySuggestedTags $ suggestTags settings probabilityCache doc
+ in if tags == tags' then Nothing else Just (doc, tags, tags')
+ )
+ allDocs
+ )
+ )
+ )
+ )
+ )
printTags :: D.Document -> IO ()
printTags doc =
diff --git a/app/Parallel.hs b/app/Parallel.hs
new file mode 100644
index 0000000..02252ff
--- /dev/null
+++ b/app/Parallel.hs
@@ -0,0 +1,22 @@
+{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
+
+module Parallel
+ ( streamsOf,
+ parMapM,
+ )
+where
+
+import Control.Concurrent.ParallelIO.Local (parallel, withPool)
+import GHC.Conc (numCapabilities)
+
+streamsOf :: Int -> [a] -> [[a]]
+streamsOf 1 xs = [xs]
+streamsOf n xs
+ | n > 0 = [everyN k xs | k <- [0 .. n - 1]]
+ | otherwise = []
+ where
+ everyN k xs = map snd $ filter ((== k) . (`mod` n) . fst) $ zip [0 ..] xs
+
+parMapM :: (a -> IO b) -> [a] -> IO [b]
+parMapM f xs = do
+ withPool numCapabilities $ \pool -> parallel pool (map f xs)
diff --git a/app/ProbabilityMap.hs b/app/ProbabilityMap.hs
index 9a700d7..dc7067d 100644
--- a/app/ProbabilityMap.hs
+++ b/app/ProbabilityMap.hs
@@ -25,7 +25,7 @@ import Control.Applicative (liftA2)
import Control.Arrow ((***))
import Control.DeepSeq (NFData (..))
import Control.Exception (SomeException, try)
-import Control.Parallel.Strategies
+import Control.Parallel.Strategies (parList, rdeepseq, withStrategy)
import Data.Binary qualified as B
import Data.List (foldl')
import Data.Map qualified as M
@@ -37,6 +37,7 @@ import Data.Vector.Binary ()
import Document qualified as D
import GHC.Conc (numCapabilities)
import GHC.Generics (Generic)
+import Parallel (streamsOf)
import Tag qualified as G
data ProbabilityCache = ProbabilityCache
@@ -178,14 +179,6 @@ probabilityMap (ProbabilityCache' {..}) =
allTags = M.keys docsPerTag
fi = fromIntegral @Int @Double
-streamsOf :: Int -> [a] -> [[a]]
-streamsOf 1 xs = [xs]
-streamsOf n xs
- | n > 0 = [everyN k xs | k <- [0 .. n - 1]]
- | otherwise = []
- where
- everyN k xs = map snd $ filter ((== k) . (`mod` n) . fst) $ zip [0 ..] xs
-
addDocument :: D.Document -> ProbabilityCache -> ProbabilityCache
addDocument doc probabilityCache =
fromProbabilityCache' $ addDocument' doc (toProbabilityCache' probabilityCache)