diff options
Diffstat (limited to 'app/Main.hs')
-rw-r--r-- | app/Main.hs | 52 |
1 files changed, 52 insertions, 0 deletions
-- | OCR a scanned PDF in place.
--
-- The document is rasterised with @pdftoppm@, each page image is run
-- through @tesseract@ to produce a one-page PDF, the pages are merged
-- with @pdfunite@, and the result replaces the original file (a @.bak@
-- copy of the original is written first). Finally the text of the new
-- document is printed with @pdftotext@.
module Main where

import Control.Exception (Exception, throwIO)
import Control.Monad (forM, void, when)
import Data.ByteString.Lazy qualified as LB
import Data.List (sort)
import Data.String (IsString (fromString))
import System.Directory (copyFile, listDirectory, renameFile)
import System.FilePath (replaceFileName, takeFileName, (<.>), (</>))
import System.IO.Temp (withSystemTempDirectory)
import System.Process.Typed (ExitCode (..), ProcessConfig, proc, readProcess)

-- | The document processed by 'main'.
defaultInput :: FilePath
defaultInput = "0000001.pdf"

-- | OCR 'defaultInput' in place, then write its extracted text to stdout.
main :: IO ()
main = do
  ocrInPlace defaultInput
  LB.putStr =<< run "pdftotext" [defaultInput, "-"]

-- | Replace @input@ with an OCR-processed version of itself.
--
-- The original is preserved as @input <.> "bak"@. All intermediate
-- files live in a temporary directory that is removed afterwards.
--
-- Throws 'ProcessException' when one of the external tools exits with a
-- non-zero status, and an 'IOError' when no page images were rendered.
ocrInPlace :: FilePath -> IO ()
ocrInPlace input =
  withSystemTempDirectory name $ \tmp -> do
    let merged = tmp </> name

    -- Render every page as a 300 dpi PNG named "<merged>-<page>.png".
    run_ "pdftoppm" ["-png", "-r", "300", input, merged]

    -- The directory holds only the page images at this point. Sorting by
    -- name is used as page order; this assumes pdftoppm zero-pads the
    -- page numbers (TODO confirm for the installed poppler version).
    pages <- sort <$> listDirectory tmp
    when (null pages) $
      ioError (userError ("pdftoppm produced no page images for " ++ input))

    pagePdfs <- forM pages $ \page -> do
      let image = tmp </> page
      -- tesseract(1): FILE OUTPUTBASE [OPTIONS]... [CONFIGFILE]...
      -- Options are spelled --psm/--oem and must precede the config name
      -- ("pdf"); anything after it is read as another config file.
      run_ "tesseract" [image, image, "--psm", "1", "--oem", "1", "pdf"]
      -- tesseract appends ".pdf" to the output base it was given.
      pure (image <.> "pdf")

    run_ "pdfunite" (pagePdfs ++ [merged])

    copyFile input (input <.> "bak")
    -- Copy next to the target under a dot-prefixed name first, so the
    -- final step is a rename within one directory rather than a copy
    -- over the live file.
    copyFile merged staged
    renameFile staged input
  where
    -- Only the file-name component is used for temporary names, so an
    -- input with a directory part still yields valid paths.
    name = takeFileName input
    staged = replaceFileName input ('.' : name)

-- | Raised when an external command exits with a non-zero status.
-- Carries the exit code and everything the command wrote to stderr.
data ProcessException = ProcessException Int LB.ByteString
  deriving (Show)

instance Exception ProcessException

-- | Run a process to completion and return its stdout.
--
-- Throws 'ProcessException' (exit code plus captured stderr) on failure.
runChecked :: ProcessConfig () () () -> IO LB.ByteString
runChecked config = do
  (exitCode, out, err) <- readProcess config
  case exitCode of
    ExitSuccess -> pure out
    ExitFailure code -> throwIO (ProcessException code err)

-- | Run @program@ with an explicit argument vector and return its stdout.
-- No shell is involved, so arguments need no quoting.
run :: FilePath -> [String] -> IO LB.ByteString
run program = runChecked . proc program

-- | Like 'run', but discard the captured stdout.
run_ :: FilePath -> [String] -> IO ()
run_ program = void . run program

-- | Run a command given as a single string and return its stdout.
--
-- Retained for callers that want a command line in one string; the
-- string is turned into a process configuration by typed-process's
-- 'IsString' instance. Prefer 'run' when arguments contain file names.
sh :: String -> IO LB.ByteString
sh = runChecked . fromString

-- | Like 'sh', but discard the captured stdout.
sh_ :: String -> IO ()
sh_ = void . sh