diff options
author | Alexander Foremny <aforemny@posteo.de> | 2023-12-26 06:03:31 +0100 |
---|---|---|
committer | Alexander Foremny <aforemny@posteo.de> | 2023-12-26 06:03:31 +0100 |
commit | 7b30bdd8763da0c8e9dd5c9464f9aca0a8d000ed (patch) | |
tree | e1c803e5cf5c787f19fd8e2df53d87422315c32e | |
parent | f83bec1d304250b0af5a7fba60770d7250d05eea (diff) |
chore: add `--language`
-rw-r--r-- | apaperless.yaml | 1 | ||||
-rw-r--r-- | app/Main.hs | 52 | ||||
-rw-r--r-- | app/Settings.hs | 9 |
3 files changed, 44 insertions, 18 deletions
diff --git a/apaperless.yaml b/apaperless.yaml index f6e63a7..ed74742 100644 --- a/apaperless.yaml +++ b/apaperless.yaml @@ -1,3 +1,4 @@ +defaultLanguage: deu+eng suggestedTags: - contents: - createdAt diff --git a/app/Main.hs b/app/Main.hs index a9d6ca5..156af1d 100644 --- a/app/Main.hs +++ b/app/Main.hs @@ -62,7 +62,8 @@ data Cmd { keep :: Bool, inputs :: [FilePath], prompt :: Bool, - force :: Bool + force :: Bool, + language :: Maybe String } | Edit { indexNames :: [FilePath] @@ -105,6 +106,7 @@ consumeCmd = <*> inputsArg <*> promptArg <*> forceArg + <*> languageArg editCmd :: O.Parser Cmd editCmd = @@ -176,6 +178,16 @@ filtersArg = parse ('!' : '@' : tagKey) = Just (Filter Exclude (FilterByTag (T.pack tagKey))) parse _ = Nothing +languageArg :: O.Parser (Maybe String) +languageArg = + O.optional + ( O.strOption + ( O.long "language" + <> O.short 'l' + <> O.help "Specify document language" + ) + ) + indexNamesArg :: O.Parser [FilePath] indexNamesArg = O.many $ @@ -231,8 +243,15 @@ main = do ensureDir "index" O.execParser (O.info (args O.<**> O.helper) O.idm) >>= \case - Args {cmd = Consume {keep, inputs, force, prompt}} -> do - indexNames <- parMapM (consume1 force keep) (map (cwd </>) inputs) + Args {cmd = Consume {keep, inputs, force, prompt, language}} -> do + indexNames <- + parMapM + ( consume1 + (fromMaybe settings.defaultLanguage language) + force + keep + ) + (map (cwd </>) inputs) allDocs <- getDocuments docs <- mapM (readDocument . (<.> "json")) indexNames docs' <- @@ -304,7 +323,7 @@ doRedoIf filters redo = =<< getDocuments where doRedo doc = do - originalText <- ocr doc.oFilePath + originalText <- ocr doc.index.language doc.oFilePath withGit do J.encodeFile doc.iFilePath doc.index {originalText = originalText} commitAll [doc.iFilePath] (printf "redo %s" (takeBaseName doc.iFilePath)) @@ -563,8 +582,8 @@ fileKey :: FilePath -> IO FilePath fileKey filePath = showDigest . sha1 <$> LB.readFile filePath -consume1 :: Bool -> Bool -> FilePath -> IO FilePath -consume1 force keep filePath = do +consume1 :: String -> Bool -> Bool -> FilePath -> IO FilePath +consume1 language force keep filePath = do fKey <- fileKey filePath let oFilePath = "originals" </> fKey <.> takeExtension filePath originalExists <- doesFileExist oFilePath @@ -577,7 +596,7 @@ consume1 force keep filePath = do <$> sh (printf "pdftotext -layout '%s' -" filePath) let hasText = (not . T.null) . T.strip $ originalText' if not hasText - then ocr filePath + then ocr language filePath else pure originalText' addedAt <- getCurrentTime withGit do @@ -608,8 +627,8 @@ decodeFile fp = either (throwIO . DecodeException fp) pure . J.eitherDecode =<< LB.readFile fp -ocr :: FilePath -> IO T.Text -ocr input = +ocr :: String -> FilePath -> IO T.Text +ocr language input = withSystemTempDirectory (takeBaseName input) $ \tmp -> do let fn suffix = tmp </> takeBaseName input <> suffix pdfInfo <- parsePdfInfo <$> sh (printf "pdfinfo '%s'" input) @@ -634,19 +653,20 @@ ocr input = ) ) (zip pdfImages imageFiles) - T.unlines <$> mapM (ocr1 tmp . (tmp </>)) imageFiles + T.unlines <$> mapM (ocr1 language tmp . (tmp </>)) imageFiles -ocr1 :: FilePath -> FilePath -> IO T.Text -ocr1 tmp input = +ocr1 :: String -> FilePath -> FilePath -> IO T.Text +ocr1 language tmp input = T.decodeUtf8 . LB.toStrict -- XXX `--oem 1` seems to be unavailable - <$> sh (printf "tesseract '%s' - -l deu+eng --oem 3 --psm 1" (tmp </> input)) + <$> sh (printf "tesseract '%s' - -l '%s' --oem 3 --psm 1" (tmp </> input) language) data Index = Index { originalText :: T.Text, tags :: S.Set Tag, addedAt :: UTCTime, - todo :: Bool + todo :: Bool, + language :: String } deriving (Show, Generic, Eq) @@ -675,7 +695,9 @@ internalTags :: Index -> S.Set Tag internalTags index = S.fromList ( concat - [ [Tag "addedAt" (Just (T.pack (iso8601Show index.addedAt)))], + [ [ Tag "addedAt" (Just (T.pack (iso8601Show index.addedAt))), + Tag "language" (Just (T.pack index.language)) + ], if index.todo then [Tag "todo" Nothing] else [] ] ) diff --git a/app/Settings.hs b/app/Settings.hs index 39e8e35..f4c5811 100644 --- a/app/Settings.hs +++ b/app/Settings.hs @@ -23,14 +23,16 @@ import System.Environment.XDG.BaseDir (getSystemConfigFiles, getUserConfigFile) import Text.RE.TDFA.Text qualified as R data Settings = Settings - { suggestedTags :: [SuggestedTag] + { suggestedTags :: [SuggestedTag], + defaultLanguage :: String } deriving (Show, Generic) instance Semigroup Settings where a <> b = Settings - { suggestedTags = nub (a.suggestedTags <> b.suggestedTags) + { suggestedTags = nub (a.suggestedTags <> b.suggestedTags), + defaultLanguage = b.defaultLanguage } instance Monoid Settings where @@ -45,7 +47,8 @@ instance Monoid Settings where -- - perform minor corrections on OCR, ie. parse `0 1.01.1970` SuggestTagByRE "createdAt" [R.re|[0-9]{2}\.[0-9]{2}\.[0-9]{4}|], SuggestTagByTags "correspondent" - ] + ], + defaultLanguage = "deu+eng" } instance A.FromJSON Settings |