summaryrefslogtreecommitdiffstats
path: root/app/Document.hs
diff options
context:
space:
mode:
Diffstat (limited to 'app/Document.hs')
-rw-r--r--app/Document.hs134
1 files changed, 134 insertions, 0 deletions
diff --git a/app/Document.hs b/app/Document.hs
new file mode 100644
index 0000000..ac8a73b
--- /dev/null
+++ b/app/Document.hs
@@ -0,0 +1,134 @@
+{-# LANGUAGE DataKinds #-}
+{-# LANGUAGE OverloadedRecordDot #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE StrictData #-}
+
+module Document
+ ( Document (..),
+ Index (..),
+ hasWord,
+ hasTag,
+ hasWordAndTag,
+ tagValues,
+ getDocuments,
+ readDocument,
+ )
+where
+
+import Control.Concurrent.ParallelIO.Local (parallel, withPool)
+import Control.Exception (Exception, throwIO)
+import Data.Aeson qualified as J
+import Data.ByteString.Lazy qualified as LB
+import Data.List (sort)
+import Data.Map qualified as M
+import Data.Maybe (mapMaybe, maybeToList)
+import Data.Set qualified as S
+import Data.Text qualified as T
+import Data.Time.Clock (UTCTime)
+import Data.Time.Format.ISO8601 (iso8601Show)
+import GHC.Conc (getNumProcessors)
+import GHC.Generics (Generic)
+import GHC.Records (HasField (..))
+import System.Directory (listDirectory)
+import System.FilePath (takeBaseName, (<.>), (</>))
+import Tag qualified as G
+
+-- TODO Inline `Index`
+
+-- TODO Replace `iFileName` with `id`
+data Document = Document
+ { iFileName :: String,
+ index :: Index
+ }
+ deriving (Show)
+
+instance HasField "oFilePath" Document FilePath where
+ getField doc = "originals" </> takeBaseName doc.iFileName <.> "pdf"
+
+instance HasField "iFilePath" Document FilePath where
+ getField doc = "index" </> doc.iFileName
+
+hasWord :: T.Text -> Document -> Bool
+hasWord word doc = S.member word doc.index.originalWords
+
+hasTag :: G.Tag -> Document -> Bool
+hasTag tag doc = S.member tag doc.index.tags
+
+hasWordAndTag :: T.Text -> G.Tag -> Document -> Bool
+hasWordAndTag word tag doc = hasTag tag doc && hasWord word doc
+
+tagValues :: [Document] -> M.Map T.Text (S.Set T.Text)
+tagValues docs =
+ M.unionsWith S.union $
+ mapMaybe
+ ( \tag ->
+ M.singleton (G.tagKey tag) . S.singleton <$> (G.tagValue tag)
+ )
+ (S.toList (S.unions (map (.index.tags) docs)))
+
+getDocuments :: IO [Document]
+getDocuments =
+ parMapM readDocument
+ =<< sort <$> listDirectory "index"
+
+readDocument :: FilePath -> IO Document
+readDocument iFileName =
+ Document iFileName
+ <$> decodeFile @Index ("index" </> iFileName)
+
+data Index = Index
+ { content :: T.Text,
+ tags :: S.Set G.Tag,
+ addedAt :: UTCTime,
+ modifiedAt :: Maybe UTCTime,
+ todo :: Bool,
+ language :: String
+ }
+ deriving (Show, Generic, Eq)
+
+instance J.ToJSON Index
+
+instance J.FromJSON Index
+
+instance HasField "shortText" Index T.Text where
+ getField =
+ (T.unlines . take 10 . T.lines)
+ . (.content)
+
+instance HasField "internalTags" Index (S.Set G.Tag) where
+ getField index =
+ index.tags `S.union` internalTags index
+
+-- TODO Cache `originalWords`
+--
+-- @related cache-probabilitymap
+instance HasField "originalWords" Index (S.Set T.Text) where
+ getField index =
+ S.fromList (T.words index.content)
+
+internalTags :: Index -> S.Set G.Tag
+internalTags index =
+ S.fromList
+ ( concat
+ [ [ G.tag "addedAt" (Just (T.pack (iso8601Show index.addedAt))),
+ G.tag "language" (Just (T.pack index.language))
+ ],
+ maybeToList (G.tag "modifiedAt" . Just . T.pack . iso8601Show <$> index.modifiedAt),
+ if index.todo then [G.tag "todo" Nothing] else []
+ ]
+ )
+
+parMapM :: (a -> IO b) -> [a] -> IO [b]
+parMapM f xs = do
+ n <- getNumProcessors
+ withPool n $ \pool -> parallel pool (map f xs)
+
+data DecodeException = DecodeException FilePath String
+ deriving (Show)
+
+instance Exception DecodeException
+
+decodeFile :: J.FromJSON a => FilePath -> IO a
+decodeFile fp =
+ either (throwIO . DecodeException fp) pure . J.eitherDecode
+ =<< LB.readFile fp