From d5d96dba1ff5c9cd66c295665fb422c8e930ff9d Mon Sep 17 00:00:00 2001 From: Alexander Foremny Date: Fri, 22 Dec 2023 04:03:47 +0100 Subject: fix: fix `originalText` --- app/Main.hs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/app/Main.hs b/app/Main.hs index 4ee4913..920d3a6 100644 --- a/app/Main.hs +++ b/app/Main.hs @@ -23,6 +23,7 @@ import Data.Maybe (fromMaybe) import Data.String (IsString (fromString)) import Data.Text qualified as T import Data.Text.Encoding qualified as T +import Data.Text.IO qualified as T import Debug.Trace import GHC.Conc (getNumProcessors) import GHC.Generics (Generic) @@ -91,7 +92,11 @@ main = do mapM_ putStrLn =<< parMapM (consume1 keep) (map (cwd ) filePaths) Args {cmd = List} -> do - mapM_ (putStrLn . takeBaseName . fst) + mapM_ + ( \(iFileName, index) -> do + putStrLn (takeBaseName iFileName) + T.putStrLn index.originalText + ) =<< parMapM ( \iFileName -> (,) iFileName <$> decodeFile @Index ("index" iFileName) @@ -188,14 +193,9 @@ ocr input = T.unlines <$> mapM (ocr1 tmp . (tmp )) imageFiles ocr1 :: FilePath -> FilePath -> IO T.Text -ocr1 tmp input = do +ocr1 tmp input = T.decodeUtf8 . LB.toStrict - <$> sh - ( printf - "tesseract '%s' '%s' pdf" - (tmp input) - (tmp takeBaseName input) - ) + <$> sh (printf "tesseract '%s' -" (tmp input)) data Index = Index { originalText :: T.Text -- cgit v1.2.3