diff options
Diffstat (limited to 'app/Main.hs')
-rw-r--r-- | app/Main.hs | 52 |
1 files changed, 37 insertions, 15 deletions
diff --git a/app/Main.hs b/app/Main.hs index a9d6ca5..156af1d 100644 --- a/app/Main.hs +++ b/app/Main.hs @@ -62,7 +62,8 @@ data Cmd { keep :: Bool, inputs :: [FilePath], prompt :: Bool, - force :: Bool + force :: Bool, + language :: Maybe String } | Edit { indexNames :: [FilePath] @@ -105,6 +106,7 @@ consumeCmd = <*> inputsArg <*> promptArg <*> forceArg + <*> languageArg editCmd :: O.Parser Cmd editCmd = @@ -176,6 +178,16 @@ filtersArg = parse ('!' : '@' : tagKey) = Just (Filter Exclude (FilterByTag (T.pack tagKey))) parse _ = Nothing +languageArg :: O.Parser (Maybe String) +languageArg = + O.optional + ( O.strOption + ( O.long "language" + <> O.short 'l' + <> O.help "Specify document language" + ) + ) + indexNamesArg :: O.Parser [FilePath] indexNamesArg = O.many $ @@ -231,8 +243,15 @@ main = do ensureDir "index" O.execParser (O.info (args O.<**> O.helper) O.idm) >>= \case - Args {cmd = Consume {keep, inputs, force, prompt}} -> do - indexNames <- parMapM (consume1 force keep) (map (cwd </>) inputs) + Args {cmd = Consume {keep, inputs, force, prompt, language}} -> do + indexNames <- + parMapM + ( consume1 + (fromMaybe settings.defaultLanguage language) + force + keep + ) + (map (cwd </>) inputs) allDocs <- getDocuments docs <- mapM (readDocument . (<.> "json")) indexNames docs' <- @@ -304,7 +323,7 @@ doRedoIf filters redo = =<< getDocuments where doRedo doc = do - originalText <- ocr doc.oFilePath + originalText <- ocr doc.index.language doc.oFilePath withGit do J.encodeFile doc.iFilePath doc.index {originalText = originalText} commitAll [doc.iFilePath] (printf "redo %s" (takeBaseName doc.iFilePath)) @@ -563,8 +582,8 @@ fileKey :: FilePath -> IO FilePath fileKey filePath = showDigest . sha1 <$> LB.readFile filePath -consume1 :: Bool -> Bool -> FilePath -> IO FilePath -consume1 force keep filePath = do +consume1 :: String -> Bool -> Bool -> FilePath -> IO FilePath +consume1 language force keep filePath = do fKey <- fileKey filePath let oFilePath = "originals" </> fKey <.> takeExtension filePath originalExists <- doesFileExist oFilePath @@ -577,7 +596,7 @@ consume1 force keep filePath = do <$> sh (printf "pdftotext -layout '%s' -" filePath) let hasText = (not . T.null) . T.strip $ originalText' if not hasText - then ocr filePath + then ocr language filePath else pure originalText' addedAt <- getCurrentTime withGit do @@ -608,8 +627,8 @@ decodeFile fp = either (throwIO . DecodeException fp) pure . J.eitherDecode =<< LB.readFile fp -ocr :: FilePath -> IO T.Text -ocr input = +ocr :: String -> FilePath -> IO T.Text +ocr language input = withSystemTempDirectory (takeBaseName input) $ \tmp -> do let fn suffix = tmp </> takeBaseName input <> suffix pdfInfo <- parsePdfInfo <$> sh (printf "pdfinfo '%s'" input) @@ -634,19 +653,20 @@ ocr input = ) ) (zip pdfImages imageFiles) - T.unlines <$> mapM (ocr1 tmp . (tmp </>)) imageFiles + T.unlines <$> mapM (ocr1 language tmp . (tmp </>)) imageFiles -ocr1 :: FilePath -> FilePath -> IO T.Text -ocr1 tmp input = +ocr1 :: String -> FilePath -> FilePath -> IO T.Text +ocr1 language tmp input = T.decodeUtf8 . LB.toStrict -- XXX `--oem 1` seems to be unavailable - <$> sh (printf "tesseract '%s' - -l deu+eng --oem 3 --psm 1" (tmp </> input)) + <$> sh (printf "tesseract '%s' - -l '%s' --oem 3 --psm 1" (tmp </> input) language) data Index = Index { originalText :: T.Text, tags :: S.Set Tag, addedAt :: UTCTime, - todo :: Bool + todo :: Bool, + language :: String } deriving (Show, Generic, Eq) @@ -675,7 +695,9 @@ internalTags :: Index -> S.Set Tag internalTags index = S.fromList ( concat - [ [Tag "addedAt" (Just (T.pack (iso8601Show index.addedAt)))], + [ [ Tag "addedAt" (Just (T.pack (iso8601Show index.addedAt))), + Tag "language" (Just (T.pack index.language)) + ], if index.todo then [Tag "todo" Nothing] else [] ] ) |