summaryrefslogtreecommitdiffstats
path: root/app
diff options
context:
space:
mode:
author: Alexander Foremny <aforemny@posteo.de> 2023-12-20 15:10:34 +0100
committer: Alexander Foremny <aforemny@posteo.de> 2023-12-20 15:10:34 +0100
commit: 1b574f472e6ca052db3ab5041e2fc1ab9923f163 (patch)
tree: 27016f165ad0bef99ece1b2b86b2da200d575052 /app
chore: init
Diffstat (limited to 'app')
-rw-r--r-- app/Main.hs | 52
1 files changed, 52 insertions, 0 deletions
diff --git a/app/Main.hs b/app/Main.hs
new file mode 100644
index 0000000..075414a
--- /dev/null
+++ b/app/Main.hs
@@ -0,0 +1,52 @@
+module Main where
+
+import Control.Exception (Exception, throwIO)
+import Data.ByteString.Lazy qualified as LB
+import Data.List
+import Data.String (IsString (fromString))
+import System.Directory
+import System.FilePath
+import System.IO.Temp (withSystemTempDirectory)
+import System.Process.Typed
+import Text.Printf (printf)
+
+-- | Run the OCR pipeline on the hard-coded input PDF, replacing it in place.
+--
+-- Steps, all driven through external command-line tools:
+--
+-- 1. @pdftoppm@ rasterises the input into PNG files inside a fresh
+--    temporary directory.
+-- 2. @tesseract@ is run once per file found in that directory (in sorted
+--    order), each run producing a PDF next to its image.
+-- 3. @pdfunite@ joins the per-page PDFs into one PDF in the temporary
+--    directory.
+-- 4. The original is copied to a @.bak@ sibling, the merged PDF is copied
+--    to a dot-prefixed file in the working directory and then renamed over
+--    the original.
+-- 5. @pdftotext@ output for the resulting file is written to stdout.
+--
+-- Any command exiting non-zero aborts the run with the exception thrown
+-- by 'sh'.
+main :: IO ()
+main = do
+  let input = "0000001.pdf"
+      backup = input <.> "bak"
+      staged = "." <> input
+      quote :: FilePath -> String
+      quote = printf "'%s'"
+
+  withSystemTempDirectory input $ \tmp -> do
+    let merged = tmp </> input
+        -- OCR a single rendered page; yields the name of the PDF it wrote.
+        ocrPage page = do
+          let pagePath = tmp </> page
+          sh_ (printf "tesseract '%s' '%s' pdf -psm 1 -oem 1" pagePath pagePath)
+          pure (page <.> ".pdf")
+
+    sh_ (printf "pdftoppm '%s' '%s' -png -r 300" input merged)
+    -- Sorting fixes the order in which pages are later concatenated.
+    pages <- fmap sort (listDirectory tmp)
+    pagePdfs <- traverse ocrPage pages
+    sh_ ("pdfunite " ++ unwords (map (quote . (tmp </>)) pagePdfs ++ [quote merged]))
+    copyFile input backup
+    -- Stage next to the target first so the final step is a plain rename.
+    copyFile merged staged
+    renameFile staged input
+    LB.putStr =<< sh (printf "pdftotext '%s' -" input)
+
+-- | Failure of an external command run through 'sh'. The 'Int' is the
+-- non-zero exit code and the 'LB.ByteString' is the error output that
+-- 'sh' captured alongside it.
+data ProcessException = ProcessException Int LB.ByteString
+ deriving (Show)
+
+-- Uses the default 'Exception' methods; this instance is what allows the
+-- value to be raised with 'throwIO'.
+instance Exception ProcessException
+
+-- | Run an external command and return what it wrote to standard output.
+--
+-- The command string is turned into a process configuration with
+-- 'fromString'; how the string is interpreted is decided by that instance.
+-- On a non-zero exit status a 'ProcessException' carrying the exit code
+-- and the captured error output is thrown, and the standard output is
+-- discarded.
+sh :: String -> IO LB.ByteString
+sh cmd =
+  -- For tracing, print the command first: printf "+ %s\n" cmd
+  readProcess (fromString cmd) >>= \(status, captured, errors) ->
+    case status of
+      ExitFailure code -> throwIO (ProcessException code errors)
+      ExitSuccess -> pure captured
+
+-- | Like 'sh', but throws away the command's captured standard output.
+-- Failures still surface as the exception raised by 'sh'.
+sh_ :: String -> IO ()
+sh_ cmd = () <$ sh cmd