{-# LANGUAGE DeriveAnyClass #-} {-# LANGUAGE DuplicateRecordFields #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedRecordDot #-} {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE StrictData #-} {-# LANGUAGE ViewPatterns #-} {-# LANGUAGE NoFieldSelectors #-} {-# OPTIONS_GHC -fno-warn-name-shadowing #-} module ProbabilityMap ( fileName, ProbabilityCache (..), readProbabilityCache, writeProbabilityCache, modifyProbabilityCache, fromDocuments, addDocument, deleteDocument, replaceDocument, ) where import Control.Applicative (liftA2) import Control.Arrow ((***)) import Control.DeepSeq (NFData (..)) import Control.Exception (SomeException, try) import Control.Parallel.Strategies (parList, rdeepseq, withStrategy) import Data.Binary qualified as B import Data.List (foldl') import Data.Map qualified as M import Data.Maybe (fromMaybe) import Data.Set qualified as S import Data.Text qualified as T import Data.Vector qualified as V import Data.Vector.Binary () import Document qualified as D import GHC.Conc (numCapabilities) import GHC.Generics (Generic) import Parallel (streamsOf) import Tag qualified as G data ProbabilityCache = ProbabilityCache { docsPerWord :: M.Map T.Text Int, docsPerTag :: M.Map G.Tag Int, docsPerWordAndTag :: M.Map (T.Text, G.Tag) Int, probabilityMap :: M.Map (T.Text, G.Tag) Double } deriving (Show, Generic) data ProbabilityCache' = ProbabilityCache' { docsPerWord :: M.Map T.Text Int, docsPerTag :: M.Map G.Tag Int, docsPerWordAndTag :: M.Map (T.Text, G.Tag) Int } deriving (Show, Generic) instance NFData ProbabilityCache' where rnf (ProbabilityCache' {..}) = rnf docsPerWord `seq` rnf docsPerTag `seq` rnf docsPerWordAndTag fromProbabilityCache' :: ProbabilityCache' -> ProbabilityCache fromProbabilityCache' probabilityCache'@(ProbabilityCache' {..}) = ProbabilityCache {probabilityMap = probabilityMap probabilityCache', ..} toProbabilityCache' :: ProbabilityCache -> ProbabilityCache' toProbabilityCache' (ProbabilityCache {..}) = ProbabilityCache' {..} instance B.Binary ProbabilityCache where get = fromCacheRep <$> B.get put = B.put . toCacheRep instance NFData ProbabilityCache where rnf (ProbabilityCache {..}) = rnf docsPerWord `seq` rnf docsPerTag `seq` rnf docsPerWordAndTag `seq` rnf probabilityMap data CacheRep = CacheRep { allWords :: V.Vector T.Text, allTags :: V.Vector G.Tag, docsPerWord :: M.Map Int Int, docsPerTag :: M.Map Int Int, docsPerWordAndTag :: M.Map (Int, Int) Int, probabilityMap :: M.Map (Int, Int) Double } deriving (Show, Generic, B.Binary) instance NFData CacheRep where rnf (CacheRep {..}) = rnf allWords `seq` rnf allTags `seq` rnf docsPerWord `seq` rnf docsPerTag `seq` rnf docsPerWordAndTag `seq` rnf probabilityMap fromCacheRep :: CacheRep -> ProbabilityCache fromCacheRep (CacheRep {..}) = ProbabilityCache { docsPerWord = M.mapKeys fromWord docsPerWord, docsPerTag = M.mapKeys fromTag docsPerTag, docsPerWordAndTag = M.mapKeys fromWordAndTag docsPerWordAndTag, probabilityMap = M.mapKeys fromWordAndTag probabilityMap } where fromWord = (allWords V.!) fromTag = (allTags V.!) fromWordAndTag = fromWord *** fromTag toCacheRep :: ProbabilityCache -> CacheRep toCacheRep (ProbabilityCache {..}) = CacheRep { allWords = V.fromList allWords', allTags = V.fromList allTags', docsPerWord = M.mapKeys toWord docsPerWord, docsPerTag = M.mapKeys toTag docsPerTag, docsPerWordAndTag = M.mapKeys toWordAndTag docsPerWordAndTag, probabilityMap = M.mapKeys toWordAndTag probabilityMap } where allWords' = M.keys docsPerWord allTags' = M.keys docsPerTag allWordsMap = M.fromList $ zip allWords' [0 ..] allTagsMap = M.fromList $ zip allTags' [0 ..] toWord = (allWordsMap M.!) toTag = (allTagsMap M.!) toWordAndTag = toWord *** toTag fileName :: FilePath fileName = "probabilityCache" readProbabilityCache :: IO ProbabilityCache readProbabilityCache = try (B.decodeFile fileName) >>= \case Left (_ :: SomeException) -> fromDocuments <$> D.getDocuments Right probabilityCache -> pure probabilityCache writeProbabilityCache :: ProbabilityCache -> IO () writeProbabilityCache probabilityCache = B.encodeFile fileName probabilityCache modifyProbabilityCache :: (ProbabilityCache -> ProbabilityCache) -> IO ProbabilityCache modifyProbabilityCache f = liftA2 (>>) (writeProbabilityCache . f) pure =<< readProbabilityCache fromDocuments :: [D.Document] -> ProbabilityCache fromDocuments allDocs = fromProbabilityCache' $ foldl' (flip addDocument') (ProbabilityCache' M.empty M.empty M.empty) allDocs probabilityMap :: ProbabilityCache' -> M.Map (T.Text, G.Tag) Double probabilityMap (ProbabilityCache' {..}) = M.unions . withStrategy (parList rdeepseq) . map ( M.fromList . filter (\(_, p) -> p > 0) . map (\k@(word, tag) -> (k, probability word tag)) ) $ streamsOf numCapabilities [ (word, tag) | word <- allWords, tag <- allTags ] where probability word tag = let docsWithWord = docsPerWord M.! word docsWithTag = docsPerTag M.! tag docsWithWordAndTag = fromMaybe 0 $ M.lookup (word, tag) docsPerWordAndTag in fi docsWithWordAndTag / fi (docsWithWord + docsWithTag - docsWithWordAndTag) allWords = M.keys docsPerWord allTags = M.keys docsPerTag fi = fromIntegral @Int @Double addDocument :: D.Document -> ProbabilityCache -> ProbabilityCache addDocument doc probabilityCache = fromProbabilityCache' $ addDocument' doc (toProbabilityCache' probabilityCache) deleteDocument :: D.Document -> ProbabilityCache -> ProbabilityCache deleteDocument doc probabilityCache = fromProbabilityCache' $ deleteDocument' doc (toProbabilityCache' probabilityCache) replaceDocument :: D.Document -> D.Document -> ProbabilityCache -> ProbabilityCache replaceDocument oldDoc newDoc probabilityCache = fromProbabilityCache' $ (addDocument' newDoc . deleteDocument' oldDoc) (toProbabilityCache' probabilityCache) addDocument' :: D.Document -> ProbabilityCache' -> ProbabilityCache' addDocument' doc probabilityCache = combining (+) probabilityCache (fromDocument doc) deleteDocument' :: D.Document -> ProbabilityCache' -> ProbabilityCache' deleteDocument' doc probabilityCache = combining (-) probabilityCache (fromDocument doc) combining :: (Int -> Int -> Int) -> ProbabilityCache' -> ProbabilityCache' -> ProbabilityCache' combining f probabilityCache probabilityCache' = ProbabilityCache' {..} where docsPerWord = M.unionWith f probabilityCache.docsPerWord probabilityCache'.docsPerWord docsPerTag = M.unionWith f probabilityCache.docsPerTag probabilityCache'.docsPerTag docsPerWordAndTag = M.unionWith f probabilityCache.docsPerWordAndTag probabilityCache'.docsPerWordAndTag fromDocument :: D.Document -> ProbabilityCache' fromDocument doc = let words = S.toList doc.index.originalWords tags = S.toList doc.index.tags wordAndTags = [(word, tag) | word <- words, tag <- tags] in ProbabilityCache' { docsPerWord = docsPer words, docsPerTag = docsPer tags, docsPerWordAndTag = docsPer wordAndTags } where docsPer :: Ord k => [k] -> M.Map k Int docsPer = M.fromList . map (flip (,) 1)