-- Origin: commit 1b574f472e6ca052db3ab5041e2fc1ab9923f163 ("chore: init")
-- by Alexander Foremny, Wed, 20 Dec 2023 15:10:34 +0100, which created
-- app/Main.hs (patch as rendered by cgit v1.2.3).

-- | Re-create a PDF from OCR output, in place.
--
-- The pipeline shells out to external tools:
--
-- 1. @pdftoppm@ renders the input PDF to PNG files (@-r 300@) inside a
--    temporary directory.
-- 2. @tesseract@ is run on every rendered image with the @pdf@ config, so
--    each image gets a sibling @.pdf@ file.
-- 3. @pdfunite@ merges the per-image PDFs into one PDF in the temporary
--    directory.
-- 4. The original is copied to @\<input\>.bak@, then replaced by the merged
--    PDF.
-- 5. @pdftotext@ prints the text of the resulting file to stdout.
module Main where

import Control.Exception (Exception, throwIO)
import Control.Monad (forM, void)
import Data.ByteString.Lazy qualified as LB
import Data.List (sort)
import Data.String (IsString (fromString))
import System.Directory (copyFile, listDirectory, renameFile)
import System.FilePath ((<.>), (</>))
import System.IO.Temp (withSystemTempDirectory)
import System.Process.Typed (ExitCode (..), readProcess)
import Text.Printf (printf)

-- | Run the OCR pipeline on the hard-coded file @0000001.pdf@ in the current
-- working directory.
--
-- Any failing external command aborts the run with a 'ProcessException'.
main :: IO ()
main = do
  let input = "0000001.pdf"

  -- The input file name doubles as the temporary directory's name template.
  withSystemTempDirectory input $ \tmp -> do
    -- The second operand is pdftoppm's output prefix; the rendered images
    -- land in @tmp@ and are picked up by 'listDirectory' below.
    sh_
      ( printf
          "pdftoppm %s %s -png -r 300"
          (shellQuote input)
          (shellQuote (tmp </> input))
      )
    -- Sorting the file names fixes the order in which pages are merged.
    imageInputs <- sort <$> listDirectory tmp
    outputs <- forM imageInputs $ \imageInput -> do
      -- The image path is used both as input and as output base; the
      -- resulting file is expected at @imageInput <.> ".pdf"@.
      -- NOTE(review): current tesseract manuals spell these options
      -- @--psm@ / @--oem@; confirm against the installed version.
      let image = shellQuote (tmp </> imageInput)
      sh_ (printf "tesseract %s %s pdf -psm 1 -oem 1" image image)
      pure (imageInput <.> ".pdf")
    -- pdfunite takes the source PDFs followed by the destination file.
    sh_ (unwords ("pdfunite" : map (shellQuote . (tmp </>)) (outputs ++ [input])))
    -- Keep a backup, then stage the result as a dot-file next to the
    -- original so the final replacement is a single rename.
    copyFile input (input <.> "bak")
    copyFile (tmp </> input) ("." <> input)
    renameFile ("." <> input) input
    LB.putStr =<< sh (printf "pdftotext %s -" (shellQuote input))

-- | Raised by 'sh' when a command exits with a non-zero status.
--
-- Carries the exit code and everything the command wrote to stderr.
data ProcessException = ProcessException Int LB.ByteString
  deriving (Show)

instance Exception ProcessException

-- | Wrap a string in single quotes for use as one shell word.
--
-- An embedded single quote is emitted as @'\\''@ (close the quote, escaped
-- quote, reopen), so paths containing @'@ no longer break the command line.
-- Strings without a single quote come out as plain @'...'@.
shellQuote :: String -> String
shellQuote s = "'" ++ concatMap escape s ++ "'"
  where
    escape '\'' = "'\\''"
    escape c = [c]

-- | Run a command line and return its captured stdout.
--
-- The string is turned into a process configuration via 'fromString'.
-- Throws 'ProcessException' (exit code plus captured stderr) when the command
-- exits with a failure status.
sh :: String -> IO LB.ByteString
sh cmd = do
  (exitCode, out, err) <- readProcess (fromString cmd)
  case exitCode of
    ExitSuccess -> pure out
    ExitFailure code -> throwIO (ProcessException code err)

-- | Like 'sh', but discards the command's stdout.
sh_ :: String -> IO ()
sh_ = void . sh