diff options
author | Alexander Foremny <aforemny@posteo.de> | 2024-01-11 03:27:17 +0100 |
---|---|---|
committer | Alexander Foremny <aforemny@posteo.de> | 2024-01-11 08:12:02 +0100 |
commit | 75ace0b26691848b9a16a9a0ec1880110e8262b8 (patch) | |
tree | befda6536685ebc0472f5b0273ad25fdf8eb0ae4 | |
parent | 7bdf16be84b368655ce2ee3d9ab6bf185dfb59b5 (diff) |
feat: add `check` cmd
-rw-r--r-- | apaperless.cabal | 1 | ||||
-rw-r--r-- | app/Document.hs | 22 | ||||
-rw-r--r-- | app/Main.hs | 47 | ||||
-rw-r--r-- | app/Parallel.hs | 22 | ||||
-rw-r--r-- | app/ProbabilityMap.hs | 11 |
5 files changed, 84 insertions, 19 deletions
diff --git a/apaperless.cabal b/apaperless.cabal index 1217b15..8e26457 100644 --- a/apaperless.cabal +++ b/apaperless.cabal @@ -21,6 +21,7 @@ executable apaperless main-is: Main.hs other-modules: Document + Parallel ProbabilityMap Prompt Settings diff --git a/app/Document.hs b/app/Document.hs index ac8a73b..fa9f33f 100644 --- a/app/Document.hs +++ b/app/Document.hs @@ -1,6 +1,7 @@ {-# LANGUAGE DataKinds #-} {-# LANGUAGE OverloadedRecordDot #-} {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE RecordWildCards #-} {-# LANGUAGE StrictData #-} module Document @@ -15,7 +16,7 @@ module Document ) where -import Control.Concurrent.ParallelIO.Local (parallel, withPool) +import Control.DeepSeq (NFData (rnf)) import Control.Exception (Exception, throwIO) import Data.Aeson qualified as J import Data.ByteString.Lazy qualified as LB @@ -26,9 +27,9 @@ import Data.Set qualified as S import Data.Text qualified as T import Data.Time.Clock (UTCTime) import Data.Time.Format.ISO8601 (iso8601Show) -import GHC.Conc (getNumProcessors) import GHC.Generics (Generic) import GHC.Records (HasField (..)) +import Parallel (parMapM) import System.Directory (listDirectory) import System.FilePath (takeBaseName, (<.>), (</>)) import Tag qualified as G @@ -42,6 +43,9 @@ data Document = Document } deriving (Show) +instance NFData Document where + rnf (Document {..}) = rnf iFileName `seq` rnf index + instance HasField "oFilePath" Document FilePath where getField doc = "originals" </> takeBaseName doc.iFileName <.> "pdf" @@ -86,6 +90,15 @@ data Index = Index } deriving (Show, Generic, Eq) +instance NFData Index where + rnf (Index {..}) = + rnf content `seq` + rnf tags `seq` + rnf addedAt `seq` + rnf modifiedAt `seq` + rnf todo `seq` + rnf language + instance J.ToJSON Index instance J.FromJSON Index @@ -118,11 +131,6 @@ internalTags index = ] ) -parMapM :: (a -> IO b) -> [a] -> IO [b] -parMapM f xs = do - n <- getNumProcessors - withPool n $ \pool -> parallel pool (map f xs) - data DecodeException = DecodeException FilePath String deriving (Show) diff --git a/app/Main.hs b/app/Main.hs index 936f053..ebf94c9 100644 --- a/app/Main.hs +++ b/app/Main.hs @@ -18,6 +18,7 @@ import Control.Arrow (first, second) import Control.Concurrent.ParallelIO.Local (parallel, withPool) import Control.Exception (Exception, throw, throwIO) import Control.Monad (unless, when) +import Control.Parallel.Strategies (parList, rdeepseq, withStrategy) import Data.Aeson qualified as J import Data.Attoparsec.Text qualified as A import Data.ByteString.Lazy qualified as LB @@ -28,7 +29,7 @@ import Data.HashMap.Internal qualified as HM import Data.List import Data.List.NonEmpty qualified as N import Data.Map qualified as M -import Data.Maybe (fromMaybe, mapMaybe) +import Data.Maybe (catMaybes, fromMaybe, mapMaybe) import Data.Ord (comparing) import Data.Set qualified as S import Data.String (IsString (fromString)) @@ -39,8 +40,9 @@ import Data.Time.Clock (getCurrentTime) import Data.Time.Format.ISO8601 (iso8601ParseM) import Debug.Trace import Document qualified as D -import GHC.Conc (getNumProcessors) +import GHC.Conc (getNumProcessors, numCapabilities) import Options.Applicative qualified as O +import Parallel (streamsOf) import ProbabilityMap qualified as C import Prompt qualified as P import Settings qualified as S @@ -89,6 +91,7 @@ data Cmd tags :: [G.Tag], untags :: [G.Tag] } + | Check args :: O.Parser Args args = @@ -110,7 +113,9 @@ cmd = O.command "topwords" . O.info topWordsCmd $ O.progDesc "View probability map per tag", O.command "modify" . O.info modifyCmd $ - O.progDesc "Modify document(s)" + O.progDesc "Modify document(s)", + O.command "check" . O.info checkCmd $ + O.progDesc "Check document tags" ] consumeCmd :: O.Parser Cmd @@ -157,6 +162,10 @@ modifyCmd = <*> tagsArg <*> untagsArg +checkCmd :: O.Parser Cmd +checkCmd = + pure Check + inputsArg :: O.Parser [FilePath] inputsArg = O.many @@ -374,6 +383,38 @@ main = do doc ) (map (addTags tags . removeTags untags) docs) + Args {cmd = Check} -> do + allDocs <- D.getDocuments + probabilityCache <- C.readProbabilityCache + mapM_ + ( \(doc, tags, tags') -> do + printf "%s\n" (takeBaseName doc.iFileName) + printTags doc + mapM_ + (\tag -> printf ("- %s\n") (show tag)) + (tags `S.difference` tags') + mapM_ + (\tag -> printf ("+ %s\n") (show tag)) + (tags' `S.difference` tags) + ) + ( catMaybes + ( concat + ( withStrategy + (parList rdeepseq) + ( streamsOf + numCapabilities + ( map + ( \doc -> + let tags = S.filter arbitrarilySelectTag doc.index.tags + tags' = S.fromList $ autoApplySuggestedTags $ suggestTags settings probabilityCache doc + in if tags == tags' then Nothing else Just (doc, tags, tags') + ) + allDocs + ) + ) + ) + ) + ) printTags :: D.Document -> IO () printTags doc = diff --git a/app/Parallel.hs b/app/Parallel.hs new file mode 100644 index 0000000..02252ff --- /dev/null +++ b/app/Parallel.hs @@ -0,0 +1,22 @@ +{-# OPTIONS_GHC -fno-warn-name-shadowing #-} + +module Parallel + ( streamsOf, + parMapM, + ) +where + +import Control.Concurrent.ParallelIO.Local (parallel, withPool) +import GHC.Conc (numCapabilities) + +streamsOf :: Int -> [a] -> [[a]] +streamsOf 1 xs = [xs] +streamsOf n xs + | n > 0 = [everyN k xs | k <- [0 .. n - 1]] + | otherwise = [] + where + everyN k xs = map snd $ filter ((== k) . (`mod` n) . fst) $ zip [0 ..] xs + +parMapM :: (a -> IO b) -> [a] -> IO [b] +parMapM f xs = do + withPool numCapabilities $ \pool -> parallel pool (map f xs) diff --git a/app/ProbabilityMap.hs b/app/ProbabilityMap.hs index 9a700d7..dc7067d 100644 --- a/app/ProbabilityMap.hs +++ b/app/ProbabilityMap.hs @@ -25,7 +25,7 @@ import Control.Applicative (liftA2) import Control.Arrow ((***)) import Control.DeepSeq (NFData (..)) import Control.Exception (SomeException, try) -import Control.Parallel.Strategies +import Control.Parallel.Strategies (parList, rdeepseq, withStrategy) import Data.Binary qualified as B import Data.List (foldl') import Data.Map qualified as M @@ -37,6 +37,7 @@ import Data.Vector.Binary () import Document qualified as D import GHC.Conc (numCapabilities) import GHC.Generics (Generic) +import Parallel (streamsOf) import Tag qualified as G data ProbabilityCache = ProbabilityCache @@ -178,14 +179,6 @@ probabilityMap (ProbabilityCache' {..}) = allTags = M.keys docsPerTag fi = fromIntegral @Int @Double -streamsOf :: Int -> [a] -> [[a]] -streamsOf 1 xs = [xs] -streamsOf n xs - | n > 0 = [everyN k xs | k <- [0 .. n - 1]] - | otherwise = [] - where - everyN k xs = map snd $ filter ((== k) . (`mod` n) . fst) $ zip [0 ..] xs - addDocument :: D.Document -> ProbabilityCache -> ProbabilityCache addDocument doc probabilityCache = fromProbabilityCache' $ addDocument' doc (toProbabilityCache' probabilityCache) |