module Main where import Control.Exception (Exception, throwIO) import Data.ByteString.Lazy qualified as LB import Data.List import Data.String (IsString (fromString)) import System.Directory import System.FilePath import System.IO.Temp (withSystemTempDirectory) import System.Process.Typed import Text.Printf (printf) main :: IO () main = do let input = "0000001.pdf" withSystemTempDirectory input $ \tmp -> do sh_ (printf "pdftoppm '%s' '%s' -png -r 300" input (tmp input)) imageInputs <- sort <$> listDirectory tmp outputs <- mapM ( \imageInput -> do sh_ ( printf "tesseract '%s' '%s' pdf -psm 1 -oem 1" (tmp imageInput) (tmp imageInput) ) pure (imageInput <.> ".pdf") ) imageInputs sh_ ("pdfunite " ++ intercalate " " (map (printf "'%s'" . (tmp )) outputs ++ [printf "'%s'" (tmp input)])) copyFile input (input <.> "bak") copyFile (tmp input) ("." <> input) renameFile ("." <> input) input LB.putStr =<< sh (printf "pdftotext '%s' -" input) data ProcessException = ProcessException Int LB.ByteString deriving (Show) instance Exception ProcessException sh :: String -> IO LB.ByteString sh cmd = do -- printf "+ %s\n" cmd (exitCode, out, err) <- readProcess (fromString cmd) case exitCode of ExitSuccess -> return out ExitFailure exitCode' -> throwIO $ ProcessException exitCode' err sh_ :: String -> IO () sh_ = fmap (\_ -> ()) . sh