blob: 075414a57419828599ba294d59559251e47e03d4 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
|
module Main where
import Control.Exception (Exception, throwIO)
import Data.ByteString.Lazy qualified as LB
import Data.List
import Data.String (IsString (fromString))
import System.Directory
import System.FilePath
import System.IO.Temp (withSystemTempDirectory)
import System.Process.Typed
import Text.Printf (printf)
main :: IO ()
main = do
let input = "0000001.pdf"
withSystemTempDirectory input $ \tmp -> do
sh_ (printf "pdftoppm '%s' '%s' -png -r 300" input (tmp </> input))
imageInputs <- sort <$> listDirectory tmp
outputs <-
mapM
( \imageInput -> do
sh_
( printf
"tesseract '%s' '%s' pdf -psm 1 -oem 1"
(tmp </> imageInput)
(tmp </> imageInput)
)
pure (imageInput <.> ".pdf")
)
imageInputs
sh_ ("pdfunite " ++ intercalate " " (map (printf "'%s'" . (tmp </>)) outputs ++ [printf "'%s'" (tmp </> input)]))
copyFile input (input <.> "bak")
copyFile (tmp </> input) ("." <> input)
renameFile ("." <> input) input
LB.putStr =<< sh (printf "pdftotext '%s' -" input)
data ProcessException = ProcessException Int LB.ByteString
deriving (Show)
instance Exception ProcessException
sh :: String -> IO LB.ByteString
sh cmd = do
-- printf "+ %s\n" cmd
(exitCode, out, err) <- readProcess (fromString cmd)
case exitCode of
ExitSuccess -> return out
ExitFailure exitCode' -> throwIO $ ProcessException exitCode' err
sh_ :: String -> IO ()
sh_ = fmap (\_ -> ()) . sh
|