From 7bdf16be84b368655ce2ee3d9ab6bf185dfb59b5 Mon Sep 17 00:00:00 2001 From: Alexander Foremny Date: Thu, 11 Jan 2024 03:20:56 +0100 Subject: chore: make computing `probabilityMap` more performance --- app/Document.hs | 134 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) create mode 100644 app/Document.hs (limited to 'app/Document.hs') diff --git a/app/Document.hs b/app/Document.hs new file mode 100644 index 0000000..ac8a73b --- /dev/null +++ b/app/Document.hs @@ -0,0 +1,134 @@ +{-# LANGUAGE DataKinds #-} +{-# LANGUAGE OverloadedRecordDot #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE StrictData #-} + +module Document + ( Document (..), + Index (..), + hasWord, + hasTag, + hasWordAndTag, + tagValues, + getDocuments, + readDocument, + ) +where + +import Control.Concurrent.ParallelIO.Local (parallel, withPool) +import Control.Exception (Exception, throwIO) +import Data.Aeson qualified as J +import Data.ByteString.Lazy qualified as LB +import Data.List (sort) +import Data.Map qualified as M +import Data.Maybe (mapMaybe, maybeToList) +import Data.Set qualified as S +import Data.Text qualified as T +import Data.Time.Clock (UTCTime) +import Data.Time.Format.ISO8601 (iso8601Show) +import GHC.Conc (getNumProcessors) +import GHC.Generics (Generic) +import GHC.Records (HasField (..)) +import System.Directory (listDirectory) +import System.FilePath (takeBaseName, (<.>), ()) +import Tag qualified as G + +-- TODO Inline `Index` + +-- TODO Replace `iFileName` with `id` +data Document = Document + { iFileName :: String, + index :: Index + } + deriving (Show) + +instance HasField "oFilePath" Document FilePath where + getField doc = "originals" takeBaseName doc.iFileName <.> "pdf" + +instance HasField "iFilePath" Document FilePath where + getField doc = "index" doc.iFileName + +hasWord :: T.Text -> Document -> Bool +hasWord word doc = S.member word doc.index.originalWords + +hasTag :: G.Tag -> Document -> Bool +hasTag tag doc = S.member tag doc.index.tags + +hasWordAndTag :: T.Text -> G.Tag -> Document -> Bool +hasWordAndTag word tag doc = hasTag tag doc && hasWord word doc + +tagValues :: [Document] -> M.Map T.Text (S.Set T.Text) +tagValues docs = + M.unionsWith S.union $ + mapMaybe + ( \tag -> + M.singleton (G.tagKey tag) . S.singleton <$> (G.tagValue tag) + ) + (S.toList (S.unions (map (.index.tags) docs))) + +getDocuments :: IO [Document] +getDocuments = + parMapM readDocument + =<< sort <$> listDirectory "index" + +readDocument :: FilePath -> IO Document +readDocument iFileName = + Document iFileName + <$> decodeFile @Index ("index" iFileName) + +data Index = Index + { content :: T.Text, + tags :: S.Set G.Tag, + addedAt :: UTCTime, + modifiedAt :: Maybe UTCTime, + todo :: Bool, + language :: String + } + deriving (Show, Generic, Eq) + +instance J.ToJSON Index + +instance J.FromJSON Index + +instance HasField "shortText" Index T.Text where + getField = + (T.unlines . take 10 . T.lines) + . (.content) + +instance HasField "internalTags" Index (S.Set G.Tag) where + getField index = + index.tags `S.union` internalTags index + +-- TODO Cache `originalWords` +-- +-- @related cache-probabilitymap +instance HasField "originalWords" Index (S.Set T.Text) where + getField index = + S.fromList (T.words index.content) + +internalTags :: Index -> S.Set G.Tag +internalTags index = + S.fromList + ( concat + [ [ G.tag "addedAt" (Just (T.pack (iso8601Show index.addedAt))), + G.tag "language" (Just (T.pack index.language)) + ], + maybeToList (G.tag "modifiedAt" . Just . T.pack . iso8601Show <$> index.modifiedAt), + if index.todo then [G.tag "todo" Nothing] else [] + ] + ) + +parMapM :: (a -> IO b) -> [a] -> IO [b] +parMapM f xs = do + n <- getNumProcessors + withPool n $ \pool -> parallel pool (map f xs) + +data DecodeException = DecodeException FilePath String + deriving (Show) + +instance Exception DecodeException + +decodeFile :: J.FromJSON a => FilePath -> IO a +decodeFile fp = + either (throwIO . DecodeException fp) pure . J.eitherDecode + =<< LB.readFile fp -- cgit v1.2.3