{-# LANGUAGE DataKinds #-} {-# LANGUAGE OverloadedRecordDot #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE StrictData #-} module Document ( Document (..), Index (..), hasWord, hasTag, hasWordAndTag, tagValues, getDocuments, readDocument, ) where import Control.Concurrent.ParallelIO.Local (parallel, withPool) import Control.Exception (Exception, throwIO) import Data.Aeson qualified as J import Data.ByteString.Lazy qualified as LB import Data.List (sort) import Data.Map qualified as M import Data.Maybe (mapMaybe, maybeToList) import Data.Set qualified as S import Data.Text qualified as T import Data.Time.Clock (UTCTime) import Data.Time.Format.ISO8601 (iso8601Show) import GHC.Conc (getNumProcessors) import GHC.Generics (Generic) import GHC.Records (HasField (..)) import System.Directory (listDirectory) import System.FilePath (takeBaseName, (<.>), ()) import Tag qualified as G -- TODO Inline `Index` -- TODO Replace `iFileName` with `id` data Document = Document { iFileName :: String, index :: Index } deriving (Show) instance HasField "oFilePath" Document FilePath where getField doc = "originals" takeBaseName doc.iFileName <.> "pdf" instance HasField "iFilePath" Document FilePath where getField doc = "index" doc.iFileName hasWord :: T.Text -> Document -> Bool hasWord word doc = S.member word doc.index.originalWords hasTag :: G.Tag -> Document -> Bool hasTag tag doc = S.member tag doc.index.tags hasWordAndTag :: T.Text -> G.Tag -> Document -> Bool hasWordAndTag word tag doc = hasTag tag doc && hasWord word doc tagValues :: [Document] -> M.Map T.Text (S.Set T.Text) tagValues docs = M.unionsWith S.union $ mapMaybe ( \tag -> M.singleton (G.tagKey tag) . S.singleton <$> (G.tagValue tag) ) (S.toList (S.unions (map (.index.tags) docs))) getDocuments :: IO [Document] getDocuments = parMapM readDocument =<< sort <$> listDirectory "index" readDocument :: FilePath -> IO Document readDocument iFileName = Document iFileName <$> decodeFile @Index ("index" iFileName) data Index = Index { content :: T.Text, tags :: S.Set G.Tag, addedAt :: UTCTime, modifiedAt :: Maybe UTCTime, todo :: Bool, language :: String } deriving (Show, Generic, Eq) instance J.ToJSON Index instance J.FromJSON Index instance HasField "shortText" Index T.Text where getField = (T.unlines . take 10 . T.lines) . (.content) instance HasField "internalTags" Index (S.Set G.Tag) where getField index = index.tags `S.union` internalTags index -- TODO Cache `originalWords` -- -- @related cache-probabilitymap instance HasField "originalWords" Index (S.Set T.Text) where getField index = S.fromList (T.words index.content) internalTags :: Index -> S.Set G.Tag internalTags index = S.fromList ( concat [ [ G.tag "addedAt" (Just (T.pack (iso8601Show index.addedAt))), G.tag "language" (Just (T.pack index.language)) ], maybeToList (G.tag "modifiedAt" . Just . T.pack . iso8601Show <$> index.modifiedAt), if index.todo then [G.tag "todo" Nothing] else [] ] ) parMapM :: (a -> IO b) -> [a] -> IO [b] parMapM f xs = do n <- getNumProcessors withPool n $ \pool -> parallel pool (map f xs) data DecodeException = DecodeException FilePath String deriving (Show) instance Exception DecodeException decodeFile :: J.FromJSON a => FilePath -> IO a decodeFile fp = either (throwIO . DecodeException fp) pure . J.eitherDecode =<< LB.readFile fp