From 52255a4158af8dfbd8f7a4dfbe14b4d46df2c601 Mon Sep 17 00:00:00 2001 From: Alexander Foremny Date: Thu, 21 Dec 2023 05:37:07 +0100 Subject: chore: only process scans --- .history | 2 + apaperless.cabal | 8 ++- app/Main.hs | 172 ++++++++++++++++++++++++++++++++++++++++++----- default.nix | 44 +++++++++++++ nix/sources.json | 14 ++++ nix/sources.nix | 198 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ shell.nix | 10 +-- 7 files changed, 421 insertions(+), 27 deletions(-) create mode 100644 .history create mode 100644 default.nix create mode 100644 nix/sources.json create mode 100644 nix/sources.nix diff --git a/.history b/.history new file mode 100644 index 0000000..56f8bde --- /dev/null +++ b/.history @@ -0,0 +1,2 @@ +pdfsandwich 0000001.pdf +tesseract img-000.ppm img-000.pdf -l eng pdf diff --git a/apaperless.cabal b/apaperless.cabal index 5001d21..81f83fb 100644 --- a/apaperless.cabal +++ b/apaperless.cabal @@ -21,11 +21,15 @@ executable apaperless main-is: Main.hs -- other-modules: -- other-extensions: - build-depends: base ^>=4.16.4.0, + build-depends: + base, typed-process, bytestring, temporary, directory, - filepath + filepath, + text, + containers, + attoparsec hs-source-dirs: app default-language: GHC2021 diff --git a/app/Main.hs b/app/Main.hs index 075414a..658b28e 100644 --- a/app/Main.hs +++ b/app/Main.hs @@ -1,45 +1,182 @@ +{-# LANGUAGE BlockArguments #-} +{-# LANGUAGE OverloadedRecordDot #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE RecordWildCards #-} + module Main where -import Control.Exception (Exception, throwIO) +import Control.Arrow (second) +import Control.Exception (Exception, throw, throwIO) +import Control.Monad (join, when) +import Data.Attoparsec.Text qualified as A import Data.ByteString.Lazy qualified as LB import Data.List +import Data.Map qualified as M +import Data.Maybe (fromMaybe) import Data.String (IsString (fromString)) +import Data.Text qualified as T +import Data.Text.Encoding qualified as T +import Debug.Trace import System.Directory import System.FilePath import System.IO.Temp (withSystemTempDirectory) import System.Process.Typed import Text.Printf (printf) +import Text.Read (readMaybe) main :: IO () main = do let input = "0000001.pdf" + ocr input + +debug :: Show a => String -> a -> a +debug s x = + trace (printf "%s: %s\n" s (show x)) x +ocr :: FilePath -> IO () +ocr input = do withSystemTempDirectory input $ \tmp -> do - sh_ (printf "pdftoppm '%s' '%s' -png -r 300" input (tmp input)) - imageInputs <- sort <$> listDirectory tmp - outputs <- - mapM - ( \imageInput -> do + let fn suffix = tmp takeBaseName input <> suffix + pdfInfo <- parsePdfInfo <$> sh (printf "pdfinfo '%s'" input) + pdfImages <- parsePdfImages <$> sh (printf "pdfimages -list '%s'" input) + hasText <- + (not . T.null) . T.strip . T.decodeUtf8 . LB.toStrict + <$> sh (printf "pdftotext '%s' -" input) + when (not hasText) do + let isScan = + length pdfImages == pdfInfo.numPages + && all ((pdfInfo.pageSize ==) . imageSize) pdfImages + if isScan + then sh_ (printf "pdfimages '%s' '%s' -tiff" input (fn "")) + else sh_ (printf "pdftoppm '%s' '%s' -r 300 -tiff" input (fn "-%d.pdf")) + imageFiles <- sort <$> listDirectory tmp + -- XXX add DPI information to image so that resulting pdf preserves DPI + mapM_ + ( \(pdfImage, imageFile) -> sh_ ( printf - "tesseract '%s' '%s' pdf -psm 1 -oem 1" - (tmp imageInput) - (tmp imageInput) + "convert -density %dx%d -units PixelsPerInch '%s' '%s'" + pdfImage.xPpi + pdfImage.yPpi + (tmp imageFile) + (tmp imageFile) ) - pure (imageInput <.> ".pdf") ) - imageInputs - sh_ ("pdfunite " ++ intercalate " " (map (printf "'%s'" . (tmp )) outputs ++ [printf "'%s'" (tmp input)])) - copyFile input (input <.> "bak") - copyFile (tmp input) ("." <> input) - renameFile ("." <> input) input - LB.putStr =<< sh (printf "pdftotext '%s' -" input) + (zip pdfImages imageFiles) + pdfFiles <- mapM (ocr1 tmp . (tmp )) imageFiles + sh_ ("pdfunite " ++ intercalate " " (map (printf "'%s'" . (tmp )) pdfFiles ++ [printf "'%s'" (tmp input)])) + copyFile input (input <.> "bak") + copyFile (tmp input) ("." <> input) + renameFile ("." <> input) input + +ocr1 :: FilePath -> FilePath -> IO FilePath +ocr1 tmp input = do + sh_ + ( printf + "tesseract '%s' '%s' pdf" + (tmp input) + (tmp takeBaseName input) + ) + pure (takeBaseName input <.> "pdf") + +data PdfInfo = PdfInfo + { numPages :: Int, + pageSize :: (Double, Double) + } + deriving (Show) + +data PdfInfoException = PdfInfoException + deriving (Show) + +instance Exception PdfInfoException + +parsePdfInfo :: LB.ByteString -> PdfInfo +parsePdfInfo out' = + fromMaybe (throw PdfInfoException) $ do + numPages <- readMaybe . T.unpack =<< M.lookup "Pages" kvs + pageSize <- + rightToMaybe . A.parseOnly pageSizeParser + =<< M.lookup "Page size" kvs + pure PdfInfo {..} + where + out = T.decodeUtf8 (LB.toStrict out') + kvs = + M.fromList + . map (second T.stripStart) + . map (second T.tail . T.break (== ':')) + . filter (not . T.null) + . T.lines + $ out + pageSizeParser = + (,) + <$> (A.double <* A.string " x ") + <*> (A.double <* A.string " pts (A4)") + <* A.endOfInput + +type PdfImages = [PdfImage] + +data PdfImage = PdfImage + { page :: Int, + num :: Int, + type_ :: String, + width :: Int, + height :: Int, + color :: String, + comp :: Int, + bpc :: Int, + enc :: String, + interp :: String, + object :: Int, + id :: Int, + xPpi :: Int, + yPpi :: Int, + size :: String, + ratio :: String + } + deriving (Show) + +imageSize :: PdfImage -> (Double, Double) +imageSize (PdfImage {..}) = + let f ppi p = 72 * fromIntegral p / fromIntegral ppi + in (f xPpi width, f yPpi height) + +data PdfImagesException = PdfImagesException + deriving (Show) + +instance Exception PdfImagesException data ProcessException = ProcessException Int LB.ByteString deriving (Show) instance Exception ProcessException +parsePdfImages :: LB.ByteString -> PdfImages +parsePdfImages out' = + map + ( \(page' : num' : type_ : width' : height' : color : comp' : bpc' : enc : interp : object' : id' : xPpi' : yPpi' : size : ratio : []) -> + PdfImage + { page = read page', + num = read num', + width = read width', + height = read height', + comp = read comp', + bpc = read bpc', + object = read object', + id = read id', + xPpi = read xPpi', + yPpi = read yPpi', + .. + } + ) + . map (map T.unpack) + . map T.words + . drop 2 + . filter (not . T.null) + . T.lines + $ out + where + out = T.decodeUtf8 (LB.toStrict out') + sh :: String -> IO LB.ByteString sh cmd = do -- printf "+ %s\n" cmd @@ -50,3 +187,6 @@ sh cmd = do sh_ :: String -> IO () sh_ = fmap (\_ -> ()) . sh + +rightToMaybe :: Either e a -> Maybe a +rightToMaybe = either (const Nothing) Just diff --git a/default.nix b/default.nix new file mode 100644 index 0000000..16d73b0 --- /dev/null +++ b/default.nix @@ -0,0 +1,44 @@ +{ sources ? import ./nix/sources.nix +, pkgs ? import sources.nixpkgs { + overlays = [ + (self: super: { + apaperless = pkgs.writers.writeDashBin "apaperless" '' + set -efu + exec cabal run apaperless -- "$@" + ''; + }) + ]; + } +}: +let + haskellPackages = pkgs.haskellPackages.override { + overrides = self: super: { + apaperless = super.callCabal2nix "apaperless" ./. { }; + }; + }; +in +rec { + inherit (haskellPackages) apaperless; + shell = haskellPackages.shellFor { + packages = _: [ apaperless ]; + buildInputs = [ + haskellPackages.cabal-install + haskellPackages.hlint + haskellPackages.ormolu + haskellPackages.pointfree + pkgs.apaperless + pkgs.ghcid + pkgs.haskell-language-server + pkgs.niv + pkgs.tesseract + pkgs.pdfsandwich + pkgs.unpaper + ]; + withHoogle = true; + withHaddock = true; + shellHook = '' + HISTFILE=${pkgs.lib.escapeShellArg ./.}/.history; export HISTFILE + ''; + }; +} + diff --git a/nix/sources.json b/nix/sources.json new file mode 100644 index 0000000..a639c84 --- /dev/null +++ b/nix/sources.json @@ -0,0 +1,14 @@ +{ + "nixpkgs": { + "branch": "nixos-unstable", + "description": "Nix Packages collection", + "homepage": null, + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "de8562bcdef27d9b76528e6e95a0902fb3fb7414", + "sha256": "0vwjdr4f2x4bd9zl72wf0hzl4ipagyhdl3ivlz3qrv42a5nd6hds", + "type": "tarball", + "url": "https://github.com/NixOS/nixpkgs/archive/de8562bcdef27d9b76528e6e95a0902fb3fb7414.tar.gz", + "url_template": "https://github.com///archive/.tar.gz" + } +} diff --git a/nix/sources.nix b/nix/sources.nix new file mode 100644 index 0000000..fe3dadf --- /dev/null +++ b/nix/sources.nix @@ -0,0 +1,198 @@ +# This file has been generated by Niv. + +let + + # + # The fetchers. fetch_ fetches specs of type . + # + + fetch_file = pkgs: name: spec: + let + name' = sanitizeName name + "-src"; + in + if spec.builtin or true then + builtins_fetchurl { inherit (spec) url sha256; name = name'; } + else + pkgs.fetchurl { inherit (spec) url sha256; name = name'; }; + + fetch_tarball = pkgs: name: spec: + let + name' = sanitizeName name + "-src"; + in + if spec.builtin or true then + builtins_fetchTarball { name = name'; inherit (spec) url sha256; } + else + pkgs.fetchzip { name = name'; inherit (spec) url sha256; }; + + fetch_git = name: spec: + let + ref = + spec.ref or ( + if spec ? branch then "refs/heads/${spec.branch}" else + if spec ? tag then "refs/tags/${spec.tag}" else + abort "In git source '${name}': Please specify `ref`, `tag` or `branch`!" + ); + submodules = spec.submodules or false; + submoduleArg = + let + nixSupportsSubmodules = builtins.compareVersions builtins.nixVersion "2.4" >= 0; + emptyArgWithWarning = + if submodules + then + builtins.trace + ( + "The niv input \"${name}\" uses submodules " + + "but your nix's (${builtins.nixVersion}) builtins.fetchGit " + + "does not support them" + ) + { } + else { }; + in + if nixSupportsSubmodules + then { inherit submodules; } + else emptyArgWithWarning; + in + builtins.fetchGit + ({ url = spec.repo; inherit (spec) rev; inherit ref; } // submoduleArg); + + fetch_local = spec: spec.path; + + fetch_builtin-tarball = name: throw + ''[${name}] The niv type "builtin-tarball" is deprecated. You should instead use `builtin = true`. + $ niv modify ${name} -a type=tarball -a builtin=true''; + + fetch_builtin-url = name: throw + ''[${name}] The niv type "builtin-url" will soon be deprecated. You should instead use `builtin = true`. + $ niv modify ${name} -a type=file -a builtin=true''; + + # + # Various helpers + # + + # https://github.com/NixOS/nixpkgs/pull/83241/files#diff-c6f540a4f3bfa4b0e8b6bafd4cd54e8bR695 + sanitizeName = name: + ( + concatMapStrings (s: if builtins.isList s then "-" else s) + ( + builtins.split "[^[:alnum:]+._?=-]+" + ((x: builtins.elemAt (builtins.match "\\.*(.*)" x) 0) name) + ) + ); + + # The set of packages used when specs are fetched using non-builtins. + mkPkgs = sources: system: + let + sourcesNixpkgs = + import (builtins_fetchTarball { inherit (sources.nixpkgs) url sha256; }) { inherit system; }; + hasNixpkgsPath = builtins.any (x: x.prefix == "nixpkgs") builtins.nixPath; + hasThisAsNixpkgsPath = == ./.; + in + if builtins.hasAttr "nixpkgs" sources + then sourcesNixpkgs + else if hasNixpkgsPath && ! hasThisAsNixpkgsPath then + import { } + else + abort + '' + Please specify either (through -I or NIX_PATH=nixpkgs=...) or + add a package called "nixpkgs" to your sources.json. + ''; + + # The actual fetching function. + fetch = pkgs: name: spec: + + if ! builtins.hasAttr "type" spec then + abort "ERROR: niv spec ${name} does not have a 'type' attribute" + else if spec.type == "file" then fetch_file pkgs name spec + else if spec.type == "tarball" then fetch_tarball pkgs name spec + else if spec.type == "git" then fetch_git name spec + else if spec.type == "local" then fetch_local spec + else if spec.type == "builtin-tarball" then fetch_builtin-tarball name + else if spec.type == "builtin-url" then fetch_builtin-url name + else + abort "ERROR: niv spec ${name} has unknown type ${builtins.toJSON spec.type}"; + + # If the environment variable NIV_OVERRIDE_${name} is set, then use + # the path directly as opposed to the fetched source. + replace = name: drv: + let + saneName = stringAsChars (c: if (builtins.match "[a-zA-Z0-9]" c) == null then "_" else c) name; + ersatz = builtins.getEnv "NIV_OVERRIDE_${saneName}"; + in + if ersatz == "" then drv else + # this turns the string into an actual Nix path (for both absolute and + # relative paths) + if builtins.substring 0 1 ersatz == "/" then /. + ersatz else /. + builtins.getEnv "PWD" + "/${ersatz}"; + + # Ports of functions for older nix versions + + # a Nix version of mapAttrs if the built-in doesn't exist + mapAttrs = builtins.mapAttrs or ( + f: set: with builtins; + listToAttrs (map (attr: { name = attr; value = f attr set.${attr}; }) (attrNames set)) + ); + + # https://github.com/NixOS/nixpkgs/blob/0258808f5744ca980b9a1f24fe0b1e6f0fecee9c/lib/lists.nix#L295 + range = first: last: if first > last then [ ] else builtins.genList (n: first + n) (last - first + 1); + + # https://github.com/NixOS/nixpkgs/blob/0258808f5744ca980b9a1f24fe0b1e6f0fecee9c/lib/strings.nix#L257 + stringToCharacters = s: map (p: builtins.substring p 1 s) (range 0 (builtins.stringLength s - 1)); + + # https://github.com/NixOS/nixpkgs/blob/0258808f5744ca980b9a1f24fe0b1e6f0fecee9c/lib/strings.nix#L269 + stringAsChars = f: s: concatStrings (map f (stringToCharacters s)); + concatMapStrings = f: list: concatStrings (map f list); + concatStrings = builtins.concatStringsSep ""; + + # https://github.com/NixOS/nixpkgs/blob/8a9f58a375c401b96da862d969f66429def1d118/lib/attrsets.nix#L331 + optionalAttrs = cond: as: if cond then as else { }; + + # fetchTarball version that is compatible between all the versions of Nix + builtins_fetchTarball = { url, name ? null, sha256 }@attrs: + let + inherit (builtins) lessThan nixVersion fetchTarball; + in + if lessThan nixVersion "1.12" then + fetchTarball ({ inherit url; } // (optionalAttrs (name != null) { inherit name; })) + else + fetchTarball attrs; + + # fetchurl version that is compatible between all the versions of Nix + builtins_fetchurl = { url, name ? null, sha256 }@attrs: + let + inherit (builtins) lessThan nixVersion fetchurl; + in + if lessThan nixVersion "1.12" then + fetchurl ({ inherit url; } // (optionalAttrs (name != null) { inherit name; })) + else + fetchurl attrs; + + # Create the final "sources" from the config + mkSources = config: + mapAttrs + ( + name: spec: + if builtins.hasAttr "outPath" spec + then + abort + "The values in sources.json should not have an 'outPath' attribute" + else + spec // { outPath = replace name (fetch config.pkgs name spec); } + ) + config.sources; + + # The "config" used by the fetchers + mkConfig = + { sourcesFile ? if builtins.pathExists ./sources.json then ./sources.json else null + , sources ? if sourcesFile == null then { } else builtins.fromJSON (builtins.readFile sourcesFile) + , system ? builtins.currentSystem + , pkgs ? mkPkgs sources system + }: rec { + # The sources, i.e. the attribute set of spec name to spec + inherit sources; + + # The "pkgs" (evaluated nixpkgs) to use for e.g. non-builtin fetchers + inherit pkgs; + }; + +in +mkSources (mkConfig { }) // { __functor = _: settings: mkSources (mkConfig settings); } diff --git a/shell.nix b/shell.nix index ab82cf6..fbd4670 100644 --- a/shell.nix +++ b/shell.nix @@ -1,9 +1 @@ -{ pkgs ? import { } }: -pkgs.mkShell { - buildInputs = [ - pkgs.haskellPackages.cabal-install - pkgs.ormolu - pkgs.poppler - pkgs.tesseract - ]; -} +(import ./default.nix { }).shell -- cgit v1.2.3