From f86cbf2598c6421a4ceab55c5fd27da12989cda3 Mon Sep 17 00:00:00 2001 From: Alexander Foremny Date: Wed, 7 Feb 2024 04:43:19 +0100 Subject: chore: normalize UTF-8 --- app/Main.hs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'app') diff --git a/app/Main.hs b/app/Main.hs index 479f3b0..0d4edcc 100644 --- a/app/Main.hs +++ b/app/Main.hs @@ -36,6 +36,7 @@ import Data.String (IsString (fromString)) import Data.Text qualified as T import Data.Text.Encoding qualified as T import Data.Text.IO qualified as T +import Data.Text.Normalize qualified as T import Data.Time.Clock (getCurrentTime) import Data.Time.Format.ISO8601 (iso8601ParseM) import Debug.Trace @@ -759,7 +760,7 @@ consume1 language force keep filePath = do let iFilePath = "index" fKey <.> "json" content <- do content' <- - T.decodeUtf8 . LB.toStrict + T.normalize T.NFC . T.decodeUtf8 . LB.toStrict <$> sh (printf "pdftotext -layout '%s' -" filePath) let hasText = (not . T.null) . T.strip $ content' if not hasText -- cgit v1.2.3