summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLibravatar Alexander Foremny <aforemny@posteo.de>2023-12-26 06:03:31 +0100
committerLibravatar Alexander Foremny <aforemny@posteo.de>2023-12-26 06:03:31 +0100
commit7b30bdd8763da0c8e9dd5c9464f9aca0a8d000ed (patch)
treee1c803e5cf5c787f19fd8e2df53d87422315c32e
parentf83bec1d304250b0af5a7fba60770d7250d05eea (diff)
chore: add `--language`
-rw-r--r--apaperless.yaml1
-rw-r--r--app/Main.hs52
-rw-r--r--app/Settings.hs9
3 files changed, 44 insertions, 18 deletions
diff --git a/apaperless.yaml b/apaperless.yaml
index f6e63a7..ed74742 100644
--- a/apaperless.yaml
+++ b/apaperless.yaml
@@ -1,3 +1,4 @@
+defaultLanguage: deu+eng
suggestedTags:
- contents:
- createdAt
diff --git a/app/Main.hs b/app/Main.hs
index a9d6ca5..156af1d 100644
--- a/app/Main.hs
+++ b/app/Main.hs
@@ -62,7 +62,8 @@ data Cmd
{ keep :: Bool,
inputs :: [FilePath],
prompt :: Bool,
- force :: Bool
+ force :: Bool,
+ language :: Maybe String
}
| Edit
{ indexNames :: [FilePath]
@@ -105,6 +106,7 @@ consumeCmd =
<*> inputsArg
<*> promptArg
<*> forceArg
+ <*> languageArg
editCmd :: O.Parser Cmd
editCmd =
@@ -176,6 +178,16 @@ filtersArg =
parse ('!' : '@' : tagKey) = Just (Filter Exclude (FilterByTag (T.pack tagKey)))
parse _ = Nothing
+languageArg :: O.Parser (Maybe String)
+languageArg =
+ O.optional
+ ( O.strOption
+ ( O.long "language"
+ <> O.short 'l'
+ <> O.help "Specify document language"
+ )
+ )
+
indexNamesArg :: O.Parser [FilePath]
indexNamesArg =
O.many $
@@ -231,8 +243,15 @@ main = do
ensureDir "index"
O.execParser (O.info (args O.<**> O.helper) O.idm) >>= \case
- Args {cmd = Consume {keep, inputs, force, prompt}} -> do
- indexNames <- parMapM (consume1 force keep) (map (cwd </>) inputs)
+ Args {cmd = Consume {keep, inputs, force, prompt, language}} -> do
+ indexNames <-
+ parMapM
+ ( consume1
+ (fromMaybe settings.defaultLanguage language)
+ force
+ keep
+ )
+ (map (cwd </>) inputs)
allDocs <- getDocuments
docs <- mapM (readDocument . (<.> "json")) indexNames
docs' <-
@@ -304,7 +323,7 @@ doRedoIf filters redo =
=<< getDocuments
where
doRedo doc = do
- originalText <- ocr doc.oFilePath
+ originalText <- ocr doc.index.language doc.oFilePath
withGit do
J.encodeFile doc.iFilePath doc.index {originalText = originalText}
commitAll [doc.iFilePath] (printf "redo %s" (takeBaseName doc.iFilePath))
@@ -563,8 +582,8 @@ fileKey :: FilePath -> IO FilePath
fileKey filePath =
showDigest . sha1 <$> LB.readFile filePath
-consume1 :: Bool -> Bool -> FilePath -> IO FilePath
-consume1 force keep filePath = do
+consume1 :: String -> Bool -> Bool -> FilePath -> IO FilePath
+consume1 language force keep filePath = do
fKey <- fileKey filePath
let oFilePath = "originals" </> fKey <.> takeExtension filePath
originalExists <- doesFileExist oFilePath
@@ -577,7 +596,7 @@ consume1 force keep filePath = do
<$> sh (printf "pdftotext -layout '%s' -" filePath)
let hasText = (not . T.null) . T.strip $ originalText'
if not hasText
- then ocr filePath
+ then ocr language filePath
else pure originalText'
addedAt <- getCurrentTime
withGit do
@@ -608,8 +627,8 @@ decodeFile fp =
either (throwIO . DecodeException fp) pure . J.eitherDecode
=<< LB.readFile fp
-ocr :: FilePath -> IO T.Text
-ocr input =
+ocr :: String -> FilePath -> IO T.Text
+ocr language input =
withSystemTempDirectory (takeBaseName input) $ \tmp -> do
let fn suffix = tmp </> takeBaseName input <> suffix
pdfInfo <- parsePdfInfo <$> sh (printf "pdfinfo '%s'" input)
@@ -634,19 +653,20 @@ ocr input =
)
)
(zip pdfImages imageFiles)
- T.unlines <$> mapM (ocr1 tmp . (tmp </>)) imageFiles
+ T.unlines <$> mapM (ocr1 language tmp . (tmp </>)) imageFiles
-ocr1 :: FilePath -> FilePath -> IO T.Text
-ocr1 tmp input =
+ocr1 :: String -> FilePath -> FilePath -> IO T.Text
+ocr1 language tmp input =
T.decodeUtf8 . LB.toStrict
-- XXX `--oem 1` seems to be unavailable
- <$> sh (printf "tesseract '%s' - -l deu+eng --oem 3 --psm 1" (tmp </> input))
+ <$> sh (printf "tesseract '%s' - -l '%s' --oem 3 --psm 1" (tmp </> input) language)
data Index = Index
{ originalText :: T.Text,
tags :: S.Set Tag,
addedAt :: UTCTime,
- todo :: Bool
+ todo :: Bool,
+ language :: String
}
deriving (Show, Generic, Eq)
@@ -675,7 +695,9 @@ internalTags :: Index -> S.Set Tag
internalTags index =
S.fromList
( concat
- [ [Tag "addedAt" (Just (T.pack (iso8601Show index.addedAt)))],
+ [ [ Tag "addedAt" (Just (T.pack (iso8601Show index.addedAt))),
+ Tag "language" (Just (T.pack index.language))
+ ],
if index.todo then [Tag "todo" Nothing] else []
]
)
diff --git a/app/Settings.hs b/app/Settings.hs
index 39e8e35..f4c5811 100644
--- a/app/Settings.hs
+++ b/app/Settings.hs
@@ -23,14 +23,16 @@ import System.Environment.XDG.BaseDir (getSystemConfigFiles, getUserConfigFile)
import Text.RE.TDFA.Text qualified as R
data Settings = Settings
- { suggestedTags :: [SuggestedTag]
+ { suggestedTags :: [SuggestedTag],
+ defaultLanguage :: String
}
deriving (Show, Generic)
instance Semigroup Settings where
a <> b =
Settings
- { suggestedTags = nub (a.suggestedTags <> b.suggestedTags)
+ { suggestedTags = nub (a.suggestedTags <> b.suggestedTags),
+ defaultLanguage = b.defaultLanguage
}
instance Monoid Settings where
@@ -45,7 +47,8 @@ instance Monoid Settings where
-- - perform minor corrections on OCR, ie. parse `0 1.01.1970`
SuggestTagByRE "createdAt" [R.re|[0-9]{2}\.[0-9]{2}\.[0-9]{4}|],
SuggestTagByTags "correspondent"
- ]
+ ],
+ defaultLanguage = "deu+eng"
}
instance A.FromJSON Settings