{-# LANGUAGE BlockArguments #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedRecordDot #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE NoFieldSelectors #-} {-# OPTIONS_GHC -fno-warn-name-shadowing #-} module Main where import Control.Arrow (second) import Control.Concurrent.ParallelIO.Local (parallel, withPool) import Control.Exception (Exception, throw, throwIO) import Control.Monad (unless, when) import Data.Aeson qualified as J import Data.Attoparsec.Text qualified as A import Data.ByteString.Lazy qualified as LB import Data.Default import Data.Digest.Pure.SHA (sha256, showDigest) import Data.List import Data.Map qualified as M import Data.Maybe (fromMaybe) import Data.String (IsString (fromString)) import Data.Text qualified as T import Data.Text.Encoding qualified as T import Data.Text.IO qualified as T import Debug.Trace import GHC.Conc (getNumProcessors) import GHC.Generics (Generic) import Options.Applicative qualified as O import System.Directory import System.Environment (getEnv) import System.FilePath import System.IO.LockFile (withLockFile) import System.IO.Temp (withSystemTempDirectory) import System.Process.Typed import Text.Printf (printf) import Text.Read (readMaybe) data Args = Args { cmd :: Cmd } data Cmd = Consume Bool [FilePath] | List args :: O.Parser Args args = Args <$> cmd cmd :: O.Parser Cmd cmd = O.hsubparser . mconcat $ [ O.command "consume" . O.info consumeCmd $ O.progDesc "Consume document(s)", O.command "list" . O.info listCmd $ O.progDesc "List document(s)" ] consumeCmd :: O.Parser Cmd consumeCmd = Consume <$> keepArg <*> filePathsArg listCmd :: O.Parser Cmd listCmd = pure List filePathsArg :: O.Parser [FilePath] filePathsArg = O.many (O.strArgument (O.metavar "FILE" <> O.action "file")) keepArg :: O.Parser Bool keepArg = O.switch ( O.long "keep" <> O.help "Keep input document" ) main :: IO () main = do cwd <- getCurrentDirectory setCurrentDirectory =<< getEnv "APAPERLESS_STORE_DIR" ensureGit ensureDir "originals" ensureDir "index" O.execParser (O.info (args O.<**> O.helper) O.idm) >>= \case Args {cmd = Consume keep filePaths} -> mapM_ putStrLn =<< parMapM (consume1 keep) (map (cwd ) filePaths) Args {cmd = List} -> do mapM_ ( \(iFileName, index) -> do putStrLn (takeBaseName iFileName) T.putStrLn index.originalText ) =<< parMapM ( \iFileName -> (,) iFileName <$> decodeFile @Index ("index" iFileName) ) =<< sort <$> listDirectory "index" ensureGit :: IO () ensureGit = do doesExist <- doesDirectoryExist ".git" unless doesExist $ sh_ "git init --initial-branch main" ensureDir :: FilePath -> IO () ensureDir dirName = createDirectoryIfMissing False dirName debug :: Show a => String -> a -> a debug s x = trace (printf "%s: %s\n" s (show x)) x fileKey :: FilePath -> IO FilePath fileKey filePath = showDigest . sha256 <$> LB.readFile filePath consume1 :: Bool -> FilePath -> IO FilePath consume1 keep filePath = do fKey <- fileKey filePath let oFilePath = "originals" fKey <.> takeExtension filePath originalExists <- doesFileExist oFilePath when originalExists do error (printf "error: error adding %s: duplicate of %s\n" filePath oFilePath) let iFilePath = "index" fKey <.> "json" originalText <- do originalText' <- T.decodeUtf8 . LB.toStrict <$> sh (printf "pdftotext '%s' -" filePath) let hasText = (not . T.null) . T.strip $ originalText' if not hasText then ocr filePath else pure originalText' withGit do J.encodeFile iFilePath Index {..} if keep then copyFile filePath oFilePath else renameFile filePath oFilePath commitAll [iFilePath, oFilePath] (printf "add %s" (takeFileName filePath)) pure (takeBaseName iFilePath) withGit :: IO a -> IO a withGit = withLockFile def ".gitlock" commitAll :: [FilePath] -> String -> IO () commitAll fps m = do sh_ ("git add -- " ++ intercalate " " (map (printf "'%s'") fps)) sh_ (printf "git commit -m '%s'" m) data DecodeException = DecodeException FilePath String deriving (Show) instance Exception DecodeException decodeFile :: J.FromJSON a => FilePath -> IO a decodeFile fp = either (throwIO . DecodeException fp) pure . J.eitherDecode =<< LB.readFile fp ocr :: FilePath -> IO T.Text ocr input = withSystemTempDirectory (takeBaseName input) $ \tmp -> do let fn suffix = tmp takeBaseName input <> suffix pdfInfo <- parsePdfInfo <$> sh (printf "pdfinfo '%s'" input) pdfImages <- parsePdfImages <$> sh (printf "pdfimages -list '%s'" input) let isScan = length pdfImages == pdfInfo.numPages && all ((pdfInfo.pageSize ==) . imageSize) pdfImages if isScan then sh_ (printf "pdfimages '%s' '%s' -tiff" input (fn "")) else sh_ (printf "pdftoppm '%s' '%s' -r 300 -tiff" input (fn "-%d.pdf")) imageFiles <- sort <$> listDirectory tmp -- XXX add DPI information to image so that resulting pdf preserves DPI parMapM_ ( \(pdfImage, imageFile) -> sh_ ( printf "convert -density %dx%d -units PixelsPerInch '%s' '%s'" pdfImage.xPpi pdfImage.yPpi (tmp imageFile) (tmp imageFile) ) ) (zip pdfImages imageFiles) T.unlines <$> mapM (ocr1 tmp . (tmp )) imageFiles ocr1 :: FilePath -> FilePath -> IO T.Text ocr1 tmp input = T.decodeUtf8 . LB.toStrict <$> sh (printf "tesseract '%s' -" (tmp input)) data Index = Index { originalText :: T.Text } deriving (Show, Generic, Eq) instance J.ToJSON Index instance J.FromJSON Index data PdfInfo = PdfInfo { numPages :: Int, pageSize :: (Double, Double) } deriving (Show) data PdfInfoException = PdfInfoException deriving (Show) instance Exception PdfInfoException parsePdfInfo :: LB.ByteString -> PdfInfo parsePdfInfo out' = fromMaybe (throw PdfInfoException) $ do numPages <- readMaybe . T.unpack =<< M.lookup "Pages" kvs pageSize <- rightToMaybe . A.parseOnly pageSizeParser =<< M.lookup "Page size" kvs pure PdfInfo {..} where out = T.decodeUtf8 (LB.toStrict out') kvs = M.fromList . map (second T.stripStart) . map (second T.tail . T.break (== ':')) . filter (not . T.null) . T.lines $ out pageSizeParser = (,) <$> (A.double <* A.string " x ") <*> (A.double <* A.string " pts (A4)") <* A.endOfInput type PdfImages = [PdfImage] data PdfImage = PdfImage { page :: Int, num :: Int, type_ :: String, width :: Int, height :: Int, color :: String, comp :: Int, bpc :: Int, enc :: String, interp :: String, object :: Int, id :: Int, xPpi :: Int, yPpi :: Int, size :: String, ratio :: String } deriving (Show) imageSize :: PdfImage -> (Double, Double) imageSize (PdfImage {..}) = let f ppi p = 72 * fromIntegral p / fromIntegral ppi in (f xPpi width, f yPpi height) data PdfImagesException = PdfImagesException deriving (Show) instance Exception PdfImagesException data ProcessException = ProcessException Int LB.ByteString deriving (Show) instance Exception ProcessException parsePdfImages :: LB.ByteString -> PdfImages parsePdfImages out' = map ( \(page' : num' : type_ : width' : height' : color : comp' : bpc' : enc : interp : object' : id' : xPpi' : yPpi' : size : ratio : []) -> PdfImage { page = read page', num = read num', width = read width', height = read height', comp = read comp', bpc = read bpc', object = read object', id = read id', xPpi = read xPpi', yPpi = read yPpi', .. } ) . map (map T.unpack) . map T.words . drop 2 . filter (not . T.null) . T.lines $ out where out = T.decodeUtf8 (LB.toStrict out') sh :: String -> IO LB.ByteString sh cmd = do (exitCode, out, err) <- readProcess (fromString cmd) case exitCode of ExitSuccess -> return out ExitFailure exitCode' -> throwIO $ ProcessException exitCode' err sh_ :: String -> IO () sh_ = fmap (\_ -> ()) . sh rightToMaybe :: Either e a -> Maybe a rightToMaybe = either (const Nothing) Just parMapM :: (a -> IO b) -> [a] -> IO [b] parMapM f xs = do n <- getNumProcessors withPool n $ \pool -> parallel pool (map f xs) parMapM_ :: (a -> IO b) -> [a] -> IO () parMapM_ f = fmap (const ()) . parMapM f