diff options
author | Alexander Foremny <aforemny@posteo.de> | 2024-01-11 03:20:56 +0100 |
---|---|---|
committer | Alexander Foremny <aforemny@posteo.de> | 2024-01-11 03:26:52 +0100 |
commit | 7bdf16be84b368655ce2ee3d9ab6bf185dfb59b5 (patch) | |
tree | 3ee48fc98f98ab7ac7ad19e24334e07b8b147dd6 /app/ProbabilityMap.hs | |
parent | 673c59d9be8b62106ffbba96d805680f0b5e7e3f (diff) |
chore: make computing `probabilityMap` more performance
Diffstat (limited to 'app/ProbabilityMap.hs')
-rw-r--r-- | app/ProbabilityMap.hs | 230 |
1 files changed, 230 insertions, 0 deletions
diff --git a/app/ProbabilityMap.hs b/app/ProbabilityMap.hs new file mode 100644 index 0000000..9a700d7 --- /dev/null +++ b/app/ProbabilityMap.hs @@ -0,0 +1,230 @@ +{-# LANGUAGE DeriveAnyClass #-} +{-# LANGUAGE DuplicateRecordFields #-} +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE OverloadedRecordDot #-} +{-# LANGUAGE RecordWildCards #-} +{-# LANGUAGE StrictData #-} +{-# LANGUAGE ViewPatterns #-} +{-# LANGUAGE NoFieldSelectors #-} +{-# OPTIONS_GHC -fno-warn-name-shadowing #-} + +module ProbabilityMap + ( fileName, + ProbabilityCache (..), + readProbabilityCache, + writeProbabilityCache, + modifyProbabilityCache, + fromDocuments, + addDocument, + deleteDocument, + replaceDocument, + ) +where + +import Control.Applicative (liftA2) +import Control.Arrow ((***)) +import Control.DeepSeq (NFData (..)) +import Control.Exception (SomeException, try) +import Control.Parallel.Strategies +import Data.Binary qualified as B +import Data.List (foldl') +import Data.Map qualified as M +import Data.Maybe (fromMaybe) +import Data.Set qualified as S +import Data.Text qualified as T +import Data.Vector qualified as V +import Data.Vector.Binary () +import Document qualified as D +import GHC.Conc (numCapabilities) +import GHC.Generics (Generic) +import Tag qualified as G + +data ProbabilityCache = ProbabilityCache + { docsPerWord :: M.Map T.Text Int, + docsPerTag :: M.Map G.Tag Int, + docsPerWordAndTag :: M.Map (T.Text, G.Tag) Int, + probabilityMap :: M.Map (T.Text, G.Tag) Double + } + deriving (Show, Generic) + +data ProbabilityCache' = ProbabilityCache' + { docsPerWord :: M.Map T.Text Int, + docsPerTag :: M.Map G.Tag Int, + docsPerWordAndTag :: M.Map (T.Text, G.Tag) Int + } + deriving (Show, Generic) + +instance NFData ProbabilityCache' where + rnf (ProbabilityCache' {..}) = + rnf docsPerWord `seq` + rnf docsPerTag `seq` + rnf docsPerWordAndTag + +fromProbabilityCache' :: ProbabilityCache' -> ProbabilityCache +fromProbabilityCache' probabilityCache'@(ProbabilityCache' {..}) = + ProbabilityCache {probabilityMap = probabilityMap probabilityCache', ..} + +toProbabilityCache' :: ProbabilityCache -> ProbabilityCache' +toProbabilityCache' (ProbabilityCache {..}) = + ProbabilityCache' {..} + +instance B.Binary ProbabilityCache where + get = fromCacheRep <$> B.get + put = B.put . toCacheRep + +instance NFData ProbabilityCache where + rnf (ProbabilityCache {..}) = + rnf docsPerWord `seq` + rnf docsPerTag `seq` + rnf docsPerWordAndTag `seq` + rnf probabilityMap + +data CacheRep = CacheRep + { allWords :: V.Vector T.Text, + allTags :: V.Vector G.Tag, + docsPerWord :: M.Map Int Int, + docsPerTag :: M.Map Int Int, + docsPerWordAndTag :: M.Map (Int, Int) Int, + probabilityMap :: M.Map (Int, Int) Double + } + deriving (Show, Generic, B.Binary) + +instance NFData CacheRep where + rnf (CacheRep {..}) = + rnf allWords `seq` + rnf allTags `seq` + rnf docsPerWord `seq` + rnf docsPerTag `seq` + rnf docsPerWordAndTag `seq` + rnf probabilityMap + +fromCacheRep :: CacheRep -> ProbabilityCache +fromCacheRep (CacheRep {..}) = + ProbabilityCache + { docsPerWord = M.mapKeys fromWord docsPerWord, + docsPerTag = M.mapKeys fromTag docsPerTag, + docsPerWordAndTag = M.mapKeys fromWordAndTag docsPerWordAndTag, + probabilityMap = M.mapKeys fromWordAndTag probabilityMap + } + where + fromWord = (allWords V.!) + fromTag = (allTags V.!) + fromWordAndTag = fromWord *** fromTag + +toCacheRep :: ProbabilityCache -> CacheRep +toCacheRep (ProbabilityCache {..}) = + CacheRep + { allWords = V.fromList allWords', + allTags = V.fromList allTags', + docsPerWord = M.mapKeys toWord docsPerWord, + docsPerTag = M.mapKeys toTag docsPerTag, + docsPerWordAndTag = M.mapKeys toWordAndTag docsPerWordAndTag, + probabilityMap = M.mapKeys toWordAndTag probabilityMap + } + where + allWords' = M.keys docsPerWord + allTags' = M.keys docsPerTag + allWordsMap = M.fromList $ zip allWords' [0 ..] + allTagsMap = M.fromList $ zip allTags' [0 ..] + toWord = (allWordsMap M.!) + toTag = (allTagsMap M.!) + toWordAndTag = toWord *** toTag + +fileName :: FilePath +fileName = "probabilityCache" + +readProbabilityCache :: IO ProbabilityCache +readProbabilityCache = + try (B.decodeFile fileName) >>= \case + Left (_ :: SomeException) -> fromDocuments <$> D.getDocuments + Right probabilityCache -> pure probabilityCache + +writeProbabilityCache :: ProbabilityCache -> IO () +writeProbabilityCache probabilityCache = + B.encodeFile fileName probabilityCache + +modifyProbabilityCache :: (ProbabilityCache -> ProbabilityCache) -> IO ProbabilityCache +modifyProbabilityCache f = + liftA2 (>>) (writeProbabilityCache . f) pure =<< readProbabilityCache + +fromDocuments :: [D.Document] -> ProbabilityCache +fromDocuments allDocs = + fromProbabilityCache' $ + foldl' (flip addDocument') (ProbabilityCache' M.empty M.empty M.empty) allDocs + +probabilityMap :: ProbabilityCache' -> M.Map (T.Text, G.Tag) Double +probabilityMap (ProbabilityCache' {..}) = + M.unions + . withStrategy (parList rdeepseq) + . map + ( M.fromList + . filter (\(_, p) -> p > 0) + . map (\k@(word, tag) -> (k, probability word tag)) + ) + $ streamsOf + numCapabilities + [ (word, tag) + | word <- allWords, + tag <- allTags + ] + where + probability word tag = + let docsWithWord = docsPerWord M.! word + docsWithTag = docsPerTag M.! tag + docsWithWordAndTag = fromMaybe 0 $ M.lookup (word, tag) docsPerWordAndTag + in fi docsWithWordAndTag / fi (docsWithWord + docsWithTag - docsWithWordAndTag) + + allWords = M.keys docsPerWord + allTags = M.keys docsPerTag + fi = fromIntegral @Int @Double + +streamsOf :: Int -> [a] -> [[a]] +streamsOf 1 xs = [xs] +streamsOf n xs + | n > 0 = [everyN k xs | k <- [0 .. n - 1]] + | otherwise = [] + where + everyN k xs = map snd $ filter ((== k) . (`mod` n) . fst) $ zip [0 ..] xs + +addDocument :: D.Document -> ProbabilityCache -> ProbabilityCache +addDocument doc probabilityCache = + fromProbabilityCache' $ addDocument' doc (toProbabilityCache' probabilityCache) + +deleteDocument :: D.Document -> ProbabilityCache -> ProbabilityCache +deleteDocument doc probabilityCache = + fromProbabilityCache' $ deleteDocument' doc (toProbabilityCache' probabilityCache) + +replaceDocument :: D.Document -> D.Document -> ProbabilityCache -> ProbabilityCache +replaceDocument oldDoc newDoc probabilityCache = + fromProbabilityCache' $ + (addDocument' newDoc . deleteDocument' oldDoc) (toProbabilityCache' probabilityCache) + +addDocument' :: D.Document -> ProbabilityCache' -> ProbabilityCache' +addDocument' doc probabilityCache = + combining (+) probabilityCache (fromDocument doc) + +deleteDocument' :: D.Document -> ProbabilityCache' -> ProbabilityCache' +deleteDocument' doc probabilityCache = + combining (-) probabilityCache (fromDocument doc) + +combining :: (Int -> Int -> Int) -> ProbabilityCache' -> ProbabilityCache' -> ProbabilityCache' +combining f probabilityCache probabilityCache' = + ProbabilityCache' {..} + where + docsPerWord = M.unionWith f probabilityCache.docsPerWord probabilityCache'.docsPerWord + docsPerTag = M.unionWith f probabilityCache.docsPerTag probabilityCache'.docsPerTag + docsPerWordAndTag = M.unionWith f probabilityCache.docsPerWordAndTag probabilityCache'.docsPerWordAndTag + +fromDocument :: D.Document -> ProbabilityCache' +fromDocument doc = + let words = S.toList doc.index.originalWords + tags = S.toList doc.index.tags + wordAndTags = [(word, tag) | word <- words, tag <- tags] + in ProbabilityCache' + { docsPerWord = docsPer words, + docsPerTag = docsPer tags, + docsPerWordAndTag = docsPer wordAndTags + } + where + docsPer :: Ord k => [k] -> M.Map k Int + docsPer = M.fromList . map (flip (,) 1) |