{-# LANGUAGE BlockArguments #-} {-# LANGUAGE DataKinds #-} {-# LANGUAGE DuplicateRecordFields #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE MultiWayIf #-} {-# LANGUAGE OverloadedRecordDot #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE QuasiQuotes #-} {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE ViewPatterns #-} {-# LANGUAGE NoFieldSelectors #-} {-# OPTIONS_GHC -fno-warn-name-shadowing #-} module Main where import Control.Arrow (first, second) import Control.Concurrent.ParallelIO.Local (parallel, withPool) import Control.Exception (Exception, throw, throwIO) import Control.Monad (forM, unless, when) import Data.Aeson qualified as J import Data.Attoparsec.Text qualified as A import Data.ByteString.Lazy qualified as LB import Data.Default import Data.Digest.Pure.SHA (sha1, showDigest) import Data.Either (partitionEithers) import Data.Function ((&)) import Data.List import Data.List.NonEmpty qualified as N import Data.Map qualified as M import Data.Maybe (catMaybes, fromMaybe, mapMaybe, maybeToList) import Data.Ord (comparing) import Data.Set qualified as S import Data.String (IsString (fromString)) import Data.Text qualified as T import Data.Text.Encoding qualified as T import Data.Text.IO qualified as T import Data.Time.Clock (UTCTime, getCurrentTime) import Data.Time.Format.ISO8601 (iso8601ParseM, iso8601Show) import Debug.Trace import GHC.Conc (getNumProcessors) import GHC.Generics (Generic) import GHC.Records (HasField (..)) import Options.Applicative qualified as O import Prompt qualified as P import Settings qualified as S import System.Directory import System.Environment (getEnv) import System.FilePath import System.IO.LockFile (withLockFile) import System.IO.Temp (withSystemTempDirectory) import System.Process.Typed import Tag qualified as G import Text.Printf (printf) import Text.RE.TDFA.Text qualified as R import Text.Read (readMaybe) data Args = Args { cmd :: Cmd } data Cmd = Consume { keep :: Bool, inputs :: [FilePath], prompt :: Bool, force :: Bool, language :: Maybe String, auto :: Bool } | Edit { indexNames :: [FilePath] } | List { filters :: [G.Filter], todo :: Bool, view :: Bool, redo :: Bool, edit :: Bool } | Todo | View { indexNames :: [FilePath] } | TopWords | Modify { indexNames :: [FilePath], tags :: [G.Tag], untags :: [G.Tag] } args :: O.Parser Args args = Args <$> cmd cmd :: O.Parser Cmd cmd = O.hsubparser . mconcat $ [ O.command "consume" . O.info consumeCmd $ O.progDesc "Consume document(s)", O.command "edit" . O.info editCmd $ O.progDesc "Edit document(s)", O.command "list" . O.info listCmd $ O.progDesc "List document(s)", O.command "todo" . O.info todoCmd $ O.progDesc "Interactively process new documents", O.command "view" . O.info viewCmd $ O.progDesc "View document(s)", O.command "topwords" . O.info topWordsCmd $ O.progDesc "View probability map per tag", O.command "modify" . O.info modifyCmd $ O.progDesc "Modify document(s)" ] consumeCmd :: O.Parser Cmd consumeCmd = Consume <$> keepArg <*> inputsArg <*> promptArg <*> forceArg <*> languageArg <*> autoArg editCmd :: O.Parser Cmd editCmd = Edit <$> indexNamesArg listCmd :: O.Parser Cmd listCmd = List <$> filtersArg <*> todoArg <*> viewArg <*> redoArg <*> editArg todoCmd :: O.Parser Cmd todoCmd = pure Todo viewCmd :: O.Parser Cmd viewCmd = View <$> indexNamesArg topWordsCmd :: O.Parser Cmd topWordsCmd = pure TopWords modifyCmd :: O.Parser Cmd modifyCmd = Modify <$> indexNamesArg <*> tagsArg <*> untagsArg inputsArg :: O.Parser [FilePath] inputsArg = O.many ( O.strArgument ( O.metavar "FILE" <> O.action "file" ) ) keepArg :: O.Parser Bool keepArg = O.switch ( O.long "keep" <> O.short 'k' <> O.help "Keep input document" ) promptArg :: O.Parser Bool promptArg = O.switch ( O.long "prompt" <> O.short 'p' <> O.help "Prompt after consuming document(s)" ) forceArg :: O.Parser Bool forceArg = O.switch ( O.long "force" <> O.short 'f' <> O.help "Force operation, overriding safety checks" ) autoArg :: O.Parser Bool autoArg = O.switch ( O.long "auto" <> O.short 'a' <> O.help "Automatically tag document(s)" ) filtersArg :: O.Parser [G.Filter] filtersArg = O.many $ O.option (O.eitherReader (A.parseOnly G.filterParser . T.pack)) ( O.long "filter" <> O.short 'f' <> O.help "Filter documents by tag" ) tagsArg :: O.Parser [G.Tag] tagsArg = O.many $ O.option (O.eitherReader (A.parseOnly G.tagParser . T.pack)) ( O.long "tag" <> O.help "Tag to add" ) untagsArg :: O.Parser [G.Tag] untagsArg = O.many $ O.option (O.eitherReader (A.parseOnly G.tagParser . T.pack)) ( O.long "untag" <> O.help "Tag to remove" ) languageArg :: O.Parser (Maybe String) languageArg = O.optional ( O.strOption ( O.long "language" <> O.short 'l' <> O.help "Specify document language" ) ) indexNamesArg :: O.Parser [FilePath] indexNamesArg = O.many $ O.strArgument ( O.metavar "ID" <> O.action "file" ) todoArg :: O.Parser Bool todoArg = O.switch ( O.long "todo" <> O.help "Run command `todo` on listed document(s)" ) redoArg :: O.Parser Bool redoArg = O.switch ( O.long "redo" <> O.help "Redo OCR on listed document(s)" ) editArg :: O.Parser Bool editArg = O.switch ( O.long "edit" <> O.help "Run command `edit` on listed document(s)" ) viewArg :: O.Parser Bool viewArg = O.switch ( O.long "view" <> O.help "Run command `view` on listed document(s)" ) main :: IO () main = do settings <- S.readSettings S.writeSettings "apaperless.yaml" settings cwd <- getCurrentDirectory setCurrentDirectory =<< getEnv "APAPERLESS_STORE_DIR" ensureGit ensureDir "originals" ensureDir "index" O.execParser (O.info (args O.<**> O.helper) O.idm) >>= \case Args {cmd = Consume {keep, inputs, force, prompt, language, auto}} -> do indexNames <- parMapM ( consume1 (fromMaybe settings.defaultLanguage language) force keep ) (map (cwd ) inputs) docs <- mapM (readDocument . (<.> "json")) indexNames allDocs <- getDocuments docs' <- if | auto -> processDocuments settings allDocs docs -- TODO adding tags interactively through prompt does not persist them in store | prompt -> processDocumentsInteractively settings allDocs docs | otherwise -> pure docs mapM_ ( \doc -> do printf "%s\n" (takeBaseName doc.iFileName) printTags doc ) docs' Args {cmd = Edit {indexNames}} -> do editDocuments =<< mapM (readDocument . (<.> "json")) indexNames Args {cmd = List {filters, redo, todo = False, view = False, edit = False}} -> do doRedoIf filters redo mapM_ ( \doc -> do printf "%s\n" (takeBaseName doc.iFileName) printTags doc ) . filter (G.applyFilters filters . (.index.tags)) =<< getDocuments Args {cmd = List {filters, redo, edit = True}} -> do doRedoIf filters redo editDocuments . filter (G.applyFilters filters . (.index.tags)) =<< getDocuments Args {cmd = List {filters, redo, todo = True}} -> do doRedoIf filters redo allDocs <- getDocuments _ <- processDocumentsInteractively settings allDocs . filter (G.applyFilters filters . (.index.tags)) $ allDocs pure () Args {cmd = Todo} -> do allDocs <- getDocuments _ <- processDocumentsInteractively settings allDocs . filter (G.applyFilters [G.filter G.include "todo" Nothing] . (.index.tags)) $ allDocs pure () Args {cmd = List {filters, redo, view = True}} -> do doRedoIf filters redo viewDocuments . filter (G.applyFilters filters . (.index.tags)) =<< getDocuments Args {cmd = View {indexNames}} -> do viewDocuments =<< mapM (readDocument . (<.> "json")) indexNames Args {cmd = TopWords} -> do allDocs <- getDocuments mapM_ ( \(tag, xs) -> do print tag mapM_ (\(word, p) -> printf " %s: %.4f\n" word p) xs ) $ map (second (sortBy (comparing (negate . snd)))) $ map (second (filter ((> 0) . snd))) $ M.toList $ foldl ( \wordProbabilityPerTag (word, tag, p) -> M.insertWith (++) tag [(word, p)] wordProbabilityPerTag ) M.empty (probabilityMap allDocs) Args {cmd = Modify {indexNames, tags, untags}} -> do docs <- mapM (readDocument . (<.> "json")) indexNames mapM_ ( \doc -> do let doc' = addTags tags (removeTags untags doc) withGit do J.encodeFile doc'.iFilePath doc'.index commitAll [doc.iFilePath] (printf "tag %s" (takeBaseName doc.iFilePath)) ) docs printTags :: Document -> IO () printTags doc = mapM_ ( \tag -> case G.tagValue tag of Nothing -> printf "@%s\n" (G.tagKey tag) Just tagValue -> printf "@%s %s\n" (G.tagKey tag) tagValue ) ( filter ( \tag -> -- TODO Handle tags hidden by default -- -- - Add `list-default-hide-tags` to `Settings` -- - Add `--show-tags`, `--hide-tags` to `List` not ("paperless-ngx." `T.isPrefixOf` G.tagKey tag) && (G.tagKey tag /= "content") && (G.tagKey tag /= "importedFrom") && (G.tagKey tag /= "language") ) . S.toList $ doc.index.tags `S.union` doc.index.internalTags ) doRedoIf :: [G.Filter] -> Bool -> IO () doRedoIf filters redo = when redo do parMapM_ doRedo . filter (G.applyFilters filters . (.index.tags)) =<< getDocuments where doRedo doc = do content <- ocr doc.index.language doc.oFilePath withGit do J.encodeFile doc.iFilePath doc.index {content = content} commitAll [doc.iFilePath] (printf "redo %s" (takeBaseName doc.iFilePath)) data Document = Document { iFileName :: String, index :: Index } deriving (Show) instance HasField "oFilePath" Document FilePath where getField doc = "originals" takeBaseName doc.iFileName <.> "pdf" instance HasField "iFilePath" Document FilePath where getField doc = "index" doc.iFileName hasWord :: T.Text -> Document -> Bool hasWord word doc = S.member word doc.index.originalWords hasTag :: G.Tag -> Document -> Bool hasTag tag doc = S.member tag doc.index.tags hasWordAndTag :: T.Text -> G.Tag -> Document -> Bool hasWordAndTag word tag doc = hasTag tag doc && hasWord word doc tagValues :: [Document] -> M.Map T.Text (S.Set T.Text) tagValues docs = M.unionsWith S.union $ mapMaybe ( \tag -> M.singleton (G.tagKey tag) . S.singleton <$> (G.tagValue tag) ) (S.toList (S.unions (map (.index.tags) docs))) getDocuments :: IO [Document] getDocuments = parMapM readDocument =<< sort <$> listDirectory "index" readDocument :: FilePath -> IO Document readDocument iFileName = Document iFileName <$> decodeFile @Index ("index" iFileName) processDocumentsInteractively :: S.Settings -> [Document] -> [Document] -> IO [Document] processDocumentsInteractively settings allDocs docs = mapM (uncurry processDocumentInteractively) (zip [1 :: Int ..] docs) where numDocs = length docs processDocumentInteractively n doc = do choice <- P.prompt ( P.choice ( printf "%s\n%s\n\n(%d/%d) Process this document?" (takeBaseName doc.iFileName) doc.index.shortText n numDocs ) (("f" :: String) N.:| ["p", "s", "v"]) & P.help ( \a -> case a of "f" -> "view full text" "p" -> "process document" "s" -> "skip document" "v" -> "view document" ) ) case choice of "f" -> do printf "%s\n" (takeBaseName doc.iFileName) printf "%s\n" doc.index.content processDocumentInteractively n doc "p" -> tagDocumentInteractively settings allDocs doc "s" -> pure doc "v" -> do viewDocuments [doc] processDocumentInteractively n doc viewDocuments :: [Document] -> IO () viewDocuments docs = sh_ ( "zathura " <> ( intercalate " " (map (\doc -> printf "'%s'" doc.oFilePath) docs) ) ) editDocuments :: [Document] -> IO () editDocuments docs = withSystemTempDirectory "apaperless" $ \tmp -> do let fp doc = tmp takeBaseName doc.iFileName <.> "txt" mapM_ ( \doc -> T.writeFile (fp doc) doc.index.content ) docs sh_ ( "vim " <> ( intercalate " " (map (\doc -> printf "'%s'" (fp doc)) docs) ) ) mapM_ ( \doc -> do content <- T.readFile (fp doc) withGit do J.encodeFile doc.iFilePath doc.index {content = content} commitAll [doc.iFilePath] (printf "edit %s" (takeBaseName doc.iFilePath)) pure content ) docs type ProbabilityMap = [(T.Text, G.Tag, Double)] -- TODO Consider words that contribute to NOT adding a tag -- -- If there is a document that should, say, not have a `@correspondent`, we should score words that contribute to that fact as well. -- -- @topic probability-map probabilityMap :: [Document] -> ProbabilityMap probabilityMap allDocs = [ let docsWithWord = filter (hasWord word) allDocs docsWithTag = filter (hasTag tag) allDocs docsWithWordAndTag = filter (hasWordAndTag word tag) allDocs p = fi (length docsWithWordAndTag) / fi ( length docsWithWord + length docsWithTag - length docsWithWordAndTag ) in (word, tag, p) | word <- S.toList allWords, tag <- S.toList allTags ] where allTags = foldl S.union S.empty (map (.index.tags) allDocs) allWords = foldl S.union S.empty (map (.index.originalWords) allDocs) fi = fromIntegral @Int @Double suggestTags :: S.Settings -> [Document] -> Document -> IO [(G.Tag, [G.Tag])] suggestTags settings allDocs doc = do forM settings.suggestedTags $ \suggestedTag -> do case suggestedTag of S.SuggestTagByRE tagName pattern -> do let tagValues = nub . catMaybes . map R.matchedText . R.allMatches $ doc.index.content R.*=~ pattern pure (G.tag tagName (Just ""), map (G.tag tagName . Just) tagValues) S.SuggestTagByTags tagName -> do let tagValues = -- TODO Cache `probabilityMap` -- -- @topic probability-map probabilityMap allDocs & filter ( \(word, tag, _) -> G.tagKey tag == tagName && hasWord word doc ) & foldl' ( \scorePerTagValue (_, tag, p) -> M.insertWith (+) (G.tagValue tag) p scorePerTagValue ) M.empty & M.toList & sortBy (comparing (negate . snd)) & map fst pure (G.tag tagName (Just ""), map (G.tag tagName) tagValues) autoApplySuggestedTags :: [(G.Tag, [G.Tag])] -> [G.Tag] autoApplySuggestedTags = mapMaybe ( \(_, suggestedTags) -> if null suggestedTags then Nothing else Just (head suggestedTags) ) processDocuments :: S.Settings -> [Document] -> [Document] -> IO [Document] processDocuments settings allDocs docs = mapM processDocument docs where processDocument doc = do tags <- S.fromList . autoApplySuggestedTags <$> suggestTags settings allDocs doc let doc' = doc { index = doc.index { tags = doc.index.tags `S.union` tags, todo = False } } withGit do J.encodeFile doc.iFilePath doc'.index commitAll [doc.iFilePath] (printf "process %s (auto)" doc.iFilePath) pure doc' applyTags :: [Either T.Text G.Tag] -> Document -> Document applyTags tags' doc = do addTags tags (removeTags untags doc) where (untags, tags) = first (map (\tagKey -> G.tag tagKey Nothing)) $ partitionEithers tags' data AddTagException = AddTagException G.Tag deriving (Show) instance Exception AddTagException addTags :: [G.Tag] -> Document -> Document addTags = flip (foldl (flip addTag)) addTag :: G.Tag -> Document -> Document addTag tag doc = if | G.tagKey tag == "todo" -> doc {index = doc.index {todo = True}} | G.tagKey tag == "addedAt" -> maybe (throw (AddTagException tag)) ( \addedAt -> doc {index = doc.index {addedAt = addedAt}} ) (iso8601ParseM . T.unpack =<< (G.tagValue tag)) | G.tagKey tag == "modifiedAt" -> maybe (throw (AddTagException tag)) ( \modifiedAt -> doc {index = doc.index {modifiedAt = Just modifiedAt}} ) (iso8601ParseM . T.unpack =<< (G.tagValue tag)) | G.tagKey tag == "content" -> maybe (throw (AddTagException tag)) ( \content -> doc {index = doc.index {content = content}} ) (G.tagValue tag) | G.tagKey tag == "language" -> throw (AddTagException tag) | otherwise -> doc { index = doc.index { tags = G.replace tag doc.index.tags } } data RemoveTagException = RemoveTagException G.Tag deriving (Show) instance Exception RemoveTagException removeTags :: [G.Tag] -> Document -> Document removeTags = flip (foldl (flip removeTag)) removeTag :: G.Tag -> Document -> Document removeTag tag doc = if | G.tagKey tag == "todo" -> doc {index = doc.index {todo = False}} | G.tagKey tag == "addedAt" -> throw (RemoveTagException tag) | G.tagKey tag == "modifiedAt" -> throw (RemoveTagException tag) | G.tagKey tag == "language" -> throw (RemoveTagException tag) | G.tagKey tag == "content" -> throw (RemoveTagException tag) | otherwise -> doc { index = doc.index { tags = maybe (G.deleteAll (G.tagKey tag)) (\_ -> G.delete tag) (G.tagValue tag) doc.index.tags } } tagDocumentInteractively :: S.Settings -> [Document] -> Document -> IO Document tagDocumentInteractively settings allDocs doc = do suggestedTags <- suggestTags settings allDocs doc tags <- mapM (uncurry tagDocumentInteractively') suggestedTags let doc' = (applyTags tags doc) { index = doc.index {todo = False} } withGit do J.encodeFile doc.iFilePath doc'.index commitAll [doc.iFilePath] (printf "process %s (interactive)" doc.iFilePath) pure doc' where tagDocumentInteractively' :: G.Tag -> [G.Tag] -> IO (Either T.Text G.Tag) tagDocumentInteractively' tag tags | Nothing <- G.tagValue tag = do choice <- P.prompt $ P.choice (printf "tag with %s?" (G.tagKey tag)) (("n" :: String) N.:| ["y"]) pure $ if (choice == "y") then Right tag else Left (G.tagKey tag) | Just _ <- G.tagValue tag = do tagValue <- fmap T.pack . P.prompt $ P.string (printf "tag with %s?" (G.tagKey tag)) (mapMaybe (fmap T.unpack . G.tagValue) tags ++ ["-"]) pure $ if tagValue == "-" then Left (G.tagKey tag) else Right tag ensureGit :: IO () ensureGit = do doesExist <- doesDirectoryExist ".git" unless doesExist $ sh_ ">/dev/null git init --initial-branch main" ensureDir :: FilePath -> IO () ensureDir dirName = createDirectoryIfMissing False dirName debug :: Show a => String -> a -> a debug s x = trace (printf "%s: %s\n" s (show x)) x fileKey :: FilePath -> IO FilePath fileKey filePath = showDigest . sha1 <$> LB.readFile filePath consume1 :: String -> Bool -> Bool -> FilePath -> IO FilePath consume1 language force keep filePath = do fKey <- fileKey filePath let oFilePath = "originals" fKey <.> takeExtension filePath originalExists <- doesFileExist oFilePath when (originalExists && not force) do error (printf "error: error adding %s: duplicate of %s\n" filePath oFilePath) let iFilePath = "index" fKey <.> "json" content <- do content' <- T.decodeUtf8 . LB.toStrict <$> sh (printf "pdftotext -layout '%s' -" filePath) let hasText = (not . T.null) . T.strip $ content' if not hasText then ocr language filePath else pure content' addedAt <- getCurrentTime withGit do J.encodeFile iFilePath Index { tags = S.empty, todo = True, modifiedAt = Nothing, .. } if keep then copyFile filePath oFilePath else renameFile filePath oFilePath commitAll [iFilePath, oFilePath] (printf "add %s" (takeFileName filePath)) pure (takeBaseName iFilePath) withGit :: IO a -> IO a withGit = withLockFile def ".gitlock" commitAll :: [FilePath] -> String -> IO () commitAll fps m = do sh_ (">/dev/null git add -- " ++ intercalate " " (map (printf "'%s'") fps)) sh_ (printf ">/dev/null git commit -m '%s' || :" m) data DecodeException = DecodeException FilePath String deriving (Show) instance Exception DecodeException decodeFile :: J.FromJSON a => FilePath -> IO a decodeFile fp = either (throwIO . DecodeException fp) pure . J.eitherDecode =<< LB.readFile fp ocr :: String -> FilePath -> IO T.Text ocr language input = withSystemTempDirectory (takeBaseName input) $ \tmp -> do let fn suffix = tmp takeBaseName input <> suffix pdfInfo <- parsePdfInfo <$> sh (printf "pdfinfo '%s'" input) pdfImages <- parsePdfImages <$> sh (printf "pdfimages -list '%s'" input) let isScan = length pdfImages == pdfInfo.numPages && all ((pdfInfo.pageSize ==) . imageSize) pdfImages if isScan then sh_ (printf "pdfimages '%s' '%s' -tiff" input (fn "")) else sh_ (printf "pdftoppm '%s' '%s' -r 300 -tiff" input (fn "-%d.pdf")) imageFiles <- sort <$> listDirectory tmp -- XXX add DPI information to image so that resulting pdf preserves DPI parMapM_ ( \(pdfImage, imageFile) -> sh_ ( printf "convert -density %dx%d -units PixelsPerInch '%s' '%s'" pdfImage.xPpi pdfImage.yPpi (tmp imageFile) (tmp imageFile) ) ) (zip pdfImages imageFiles) T.unlines <$> mapM (ocr1 language tmp . (tmp )) imageFiles ocr1 :: String -> FilePath -> FilePath -> IO T.Text ocr1 language tmp input = T.decodeUtf8 . LB.toStrict -- XXX `--oem 1` seems to be unavailable <$> sh (printf "tesseract '%s' - -l '%s' --oem 3 --psm 1" (tmp input) language) data Index = Index { content :: T.Text, tags :: S.Set G.Tag, addedAt :: UTCTime, modifiedAt :: Maybe UTCTime, todo :: Bool, language :: String } deriving (Show, Generic, Eq) instance J.ToJSON Index instance J.FromJSON Index instance HasField "shortText" Index T.Text where getField = (T.unlines . take 10 . T.lines) . (.content) instance HasField "internalTags" Index (S.Set G.Tag) where getField index = index.tags `S.union` internalTags index -- TODO Cache `originalWords` -- -- @related cache-probabilitymap instance HasField "originalWords" Index (S.Set T.Text) where getField index = S.fromList (T.words index.content) internalTags :: Index -> S.Set G.Tag internalTags index = S.fromList ( concat [ [ G.tag "addedAt" (Just (T.pack (iso8601Show index.addedAt))), G.tag "language" (Just (T.pack index.language)) ], maybeToList (G.tag "modifiedAt" . Just . T.pack . iso8601Show <$> index.modifiedAt), if index.todo then [G.tag "todo" Nothing] else [] ] ) data PdfInfo = PdfInfo { numPages :: Int, pageSize :: (Double, Double) } deriving (Show) data PdfInfoException = PdfInfoException deriving (Show) instance Exception PdfInfoException parsePdfInfo :: LB.ByteString -> PdfInfo parsePdfInfo out' = fromMaybe (throw PdfInfoException) $ do numPages <- readMaybe . T.unpack =<< M.lookup "Pages" kvs pageSize <- rightToMaybe . A.parseOnly pageSizeParser =<< M.lookup "Page size" kvs pure PdfInfo {..} where out = T.decodeUtf8 (LB.toStrict out') kvs = M.fromList . map (second T.stripStart) . map (second T.tail . T.break (== ':')) . filter (not . T.null) . T.lines $ out pageSizeParser = (,) <$> (A.double <* A.string " x ") <*> (A.double <* A.string " pts (A4)") <* A.endOfInput type PdfImages = [PdfImage] data PdfImage = PdfImage { page :: Int, num :: Int, type_ :: String, width :: Int, height :: Int, color :: String, comp :: Int, bpc :: Int, enc :: String, interp :: String, object :: Int, id :: Int, xPpi :: Int, yPpi :: Int, size :: String, ratio :: String } deriving (Show) imageSize :: PdfImage -> (Double, Double) imageSize (PdfImage {..}) = let f ppi p = 72 * fromIntegral p / fromIntegral ppi in (f xPpi width, f yPpi height) data PdfImagesException = PdfImagesException deriving (Show) instance Exception PdfImagesException data ProcessException = ProcessException Int LB.ByteString deriving (Show) instance Exception ProcessException parsePdfImages :: LB.ByteString -> PdfImages parsePdfImages out' = map ( \(page' : num' : type_ : width' : height' : color : comp' : bpc' : enc : interp : object' : id' : xPpi' : yPpi' : size : ratio : []) -> PdfImage { page = read page', num = read num', width = read width', height = read height', comp = read comp', bpc = read bpc', object = read object', id = read id', xPpi = read xPpi', yPpi = read yPpi', .. } ) . map (map T.unpack) . map T.words . drop 2 . filter (not . T.null) . T.lines $ out where out = T.decodeUtf8 (LB.toStrict out') sh :: String -> IO LB.ByteString sh cmd = do -- printf "+ %s\n" cmd (exitCode, out, err) <- readProcess (fromString cmd) case exitCode of ExitSuccess -> return out ExitFailure exitCode' -> throwIO $ ProcessException exitCode' err sh_ :: String -> IO () sh_ cmd = do -- printf "+ %s\n" cmd (exitCode, err) <- readProcessStderr (fromString cmd) case exitCode of ExitSuccess -> return () ExitFailure exitCode' -> throwIO $ ProcessException exitCode' err rightToMaybe :: Either e a -> Maybe a rightToMaybe = either (const Nothing) Just parMapM :: (a -> IO b) -> [a] -> IO [b] parMapM f xs = do n <- getNumProcessors withPool n $ \pool -> parallel pool (map f xs) parMapM_ :: (a -> IO b) -> [a] -> IO () parMapM_ f = fmap (const ()) . parMapM f