summaryrefslogtreecommitdiffstats
path: root/app/Main.hs
blob: 075414a57419828599ba294d59559251e47e03d4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
module Main where

import Control.Exception (Exception, throwIO)
import Data.ByteString.Lazy qualified as LB
import Data.List
import Data.String (IsString (fromString))
import System.Directory
import System.FilePath
import System.IO.Temp (withSystemTempDirectory)
import System.Process.Typed
import Text.Printf (printf)

main :: IO ()
main = do
  let input = "0000001.pdf"

  withSystemTempDirectory input $ \tmp -> do
    sh_ (printf "pdftoppm '%s' '%s' -png -r 300" input (tmp </> input))
    imageInputs <- sort <$> listDirectory tmp
    outputs <-
      mapM
        ( \imageInput -> do
            sh_
              ( printf
                  "tesseract '%s' '%s' pdf -psm 1 -oem 1"
                  (tmp </> imageInput)
                  (tmp </> imageInput)
              )
            pure (imageInput <.> ".pdf")
        )
        imageInputs
    sh_ ("pdfunite " ++ intercalate " " (map (printf "'%s'" . (tmp </>)) outputs ++ [printf "'%s'" (tmp </> input)]))
    copyFile input (input <.> "bak")
    copyFile (tmp </> input) ("." <> input)
    renameFile ("." <> input) input
    LB.putStr =<< sh (printf "pdftotext '%s' -" input)

data ProcessException = ProcessException Int LB.ByteString
  deriving (Show)

instance Exception ProcessException

sh :: String -> IO LB.ByteString
sh cmd = do
  -- printf "+ %s\n" cmd
  (exitCode, out, err) <- readProcess (fromString cmd)
  case exitCode of
    ExitSuccess -> return out
    ExitFailure exitCode' -> throwIO $ ProcessException exitCode' err

sh_ :: String -> IO ()
sh_ = fmap (\_ -> ()) . sh