summaryrefslogtreecommitdiffstats
path: root/app/ProbabilityMap.hs
diff options
context:
space:
mode:
Diffstat (limited to 'app/ProbabilityMap.hs')
-rw-r--r--app/ProbabilityMap.hs230
1 files changed, 230 insertions, 0 deletions
diff --git a/app/ProbabilityMap.hs b/app/ProbabilityMap.hs
new file mode 100644
index 0000000..9a700d7
--- /dev/null
+++ b/app/ProbabilityMap.hs
@@ -0,0 +1,230 @@
+{-# LANGUAGE DeriveAnyClass #-}
+{-# LANGUAGE DuplicateRecordFields #-}
+{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE OverloadedRecordDot #-}
+{-# LANGUAGE RecordWildCards #-}
+{-# LANGUAGE StrictData #-}
+{-# LANGUAGE ViewPatterns #-}
+{-# LANGUAGE NoFieldSelectors #-}
+{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
+
+module ProbabilityMap
+ ( fileName,
+ ProbabilityCache (..),
+ readProbabilityCache,
+ writeProbabilityCache,
+ modifyProbabilityCache,
+ fromDocuments,
+ addDocument,
+ deleteDocument,
+ replaceDocument,
+ )
+where
+
+import Control.Applicative (liftA2)
+import Control.Arrow ((***))
+import Control.DeepSeq (NFData (..))
+import Control.Exception (SomeException, try)
+import Control.Parallel.Strategies
+import Data.Binary qualified as B
+import Data.List (foldl')
+import Data.Map qualified as M
+import Data.Maybe (fromMaybe)
+import Data.Set qualified as S
+import Data.Text qualified as T
+import Data.Vector qualified as V
+import Data.Vector.Binary ()
+import Document qualified as D
+import GHC.Conc (numCapabilities)
+import GHC.Generics (Generic)
+import Tag qualified as G
+
+data ProbabilityCache = ProbabilityCache
+ { docsPerWord :: M.Map T.Text Int,
+ docsPerTag :: M.Map G.Tag Int,
+ docsPerWordAndTag :: M.Map (T.Text, G.Tag) Int,
+ probabilityMap :: M.Map (T.Text, G.Tag) Double
+ }
+ deriving (Show, Generic)
+
+data ProbabilityCache' = ProbabilityCache'
+ { docsPerWord :: M.Map T.Text Int,
+ docsPerTag :: M.Map G.Tag Int,
+ docsPerWordAndTag :: M.Map (T.Text, G.Tag) Int
+ }
+ deriving (Show, Generic)
+
+instance NFData ProbabilityCache' where
+ rnf (ProbabilityCache' {..}) =
+ rnf docsPerWord `seq`
+ rnf docsPerTag `seq`
+ rnf docsPerWordAndTag
+
+fromProbabilityCache' :: ProbabilityCache' -> ProbabilityCache
+fromProbabilityCache' probabilityCache'@(ProbabilityCache' {..}) =
+ ProbabilityCache {probabilityMap = probabilityMap probabilityCache', ..}
+
+toProbabilityCache' :: ProbabilityCache -> ProbabilityCache'
+toProbabilityCache' (ProbabilityCache {..}) =
+ ProbabilityCache' {..}
+
+instance B.Binary ProbabilityCache where
+ get = fromCacheRep <$> B.get
+ put = B.put . toCacheRep
+
+instance NFData ProbabilityCache where
+ rnf (ProbabilityCache {..}) =
+ rnf docsPerWord `seq`
+ rnf docsPerTag `seq`
+ rnf docsPerWordAndTag `seq`
+ rnf probabilityMap
+
+data CacheRep = CacheRep
+ { allWords :: V.Vector T.Text,
+ allTags :: V.Vector G.Tag,
+ docsPerWord :: M.Map Int Int,
+ docsPerTag :: M.Map Int Int,
+ docsPerWordAndTag :: M.Map (Int, Int) Int,
+ probabilityMap :: M.Map (Int, Int) Double
+ }
+ deriving (Show, Generic, B.Binary)
+
+instance NFData CacheRep where
+ rnf (CacheRep {..}) =
+ rnf allWords `seq`
+ rnf allTags `seq`
+ rnf docsPerWord `seq`
+ rnf docsPerTag `seq`
+ rnf docsPerWordAndTag `seq`
+ rnf probabilityMap
+
+fromCacheRep :: CacheRep -> ProbabilityCache
+fromCacheRep (CacheRep {..}) =
+ ProbabilityCache
+ { docsPerWord = M.mapKeys fromWord docsPerWord,
+ docsPerTag = M.mapKeys fromTag docsPerTag,
+ docsPerWordAndTag = M.mapKeys fromWordAndTag docsPerWordAndTag,
+ probabilityMap = M.mapKeys fromWordAndTag probabilityMap
+ }
+ where
+ fromWord = (allWords V.!)
+ fromTag = (allTags V.!)
+ fromWordAndTag = fromWord *** fromTag
+
+toCacheRep :: ProbabilityCache -> CacheRep
+toCacheRep (ProbabilityCache {..}) =
+ CacheRep
+ { allWords = V.fromList allWords',
+ allTags = V.fromList allTags',
+ docsPerWord = M.mapKeys toWord docsPerWord,
+ docsPerTag = M.mapKeys toTag docsPerTag,
+ docsPerWordAndTag = M.mapKeys toWordAndTag docsPerWordAndTag,
+ probabilityMap = M.mapKeys toWordAndTag probabilityMap
+ }
+ where
+ allWords' = M.keys docsPerWord
+ allTags' = M.keys docsPerTag
+ allWordsMap = M.fromList $ zip allWords' [0 ..]
+ allTagsMap = M.fromList $ zip allTags' [0 ..]
+ toWord = (allWordsMap M.!)
+ toTag = (allTagsMap M.!)
+ toWordAndTag = toWord *** toTag
+
+fileName :: FilePath
+fileName = "probabilityCache"
+
+readProbabilityCache :: IO ProbabilityCache
+readProbabilityCache =
+ try (B.decodeFile fileName) >>= \case
+ Left (_ :: SomeException) -> fromDocuments <$> D.getDocuments
+ Right probabilityCache -> pure probabilityCache
+
+writeProbabilityCache :: ProbabilityCache -> IO ()
+writeProbabilityCache probabilityCache =
+ B.encodeFile fileName probabilityCache
+
+modifyProbabilityCache :: (ProbabilityCache -> ProbabilityCache) -> IO ProbabilityCache
+modifyProbabilityCache f =
+ liftA2 (>>) (writeProbabilityCache . f) pure =<< readProbabilityCache
+
+fromDocuments :: [D.Document] -> ProbabilityCache
+fromDocuments allDocs =
+ fromProbabilityCache' $
+ foldl' (flip addDocument') (ProbabilityCache' M.empty M.empty M.empty) allDocs
+
+probabilityMap :: ProbabilityCache' -> M.Map (T.Text, G.Tag) Double
+probabilityMap (ProbabilityCache' {..}) =
+ M.unions
+ . withStrategy (parList rdeepseq)
+ . map
+ ( M.fromList
+ . filter (\(_, p) -> p > 0)
+ . map (\k@(word, tag) -> (k, probability word tag))
+ )
+ $ streamsOf
+ numCapabilities
+ [ (word, tag)
+ | word <- allWords,
+ tag <- allTags
+ ]
+ where
+ probability word tag =
+ let docsWithWord = docsPerWord M.! word
+ docsWithTag = docsPerTag M.! tag
+ docsWithWordAndTag = fromMaybe 0 $ M.lookup (word, tag) docsPerWordAndTag
+ in fi docsWithWordAndTag / fi (docsWithWord + docsWithTag - docsWithWordAndTag)
+
+ allWords = M.keys docsPerWord
+ allTags = M.keys docsPerTag
+ fi = fromIntegral @Int @Double
+
+streamsOf :: Int -> [a] -> [[a]]
+streamsOf 1 xs = [xs]
+streamsOf n xs
+ | n > 0 = [everyN k xs | k <- [0 .. n - 1]]
+ | otherwise = []
+ where
+ everyN k xs = map snd $ filter ((== k) . (`mod` n) . fst) $ zip [0 ..] xs
+
+addDocument :: D.Document -> ProbabilityCache -> ProbabilityCache
+addDocument doc probabilityCache =
+ fromProbabilityCache' $ addDocument' doc (toProbabilityCache' probabilityCache)
+
+deleteDocument :: D.Document -> ProbabilityCache -> ProbabilityCache
+deleteDocument doc probabilityCache =
+ fromProbabilityCache' $ deleteDocument' doc (toProbabilityCache' probabilityCache)
+
+replaceDocument :: D.Document -> D.Document -> ProbabilityCache -> ProbabilityCache
+replaceDocument oldDoc newDoc probabilityCache =
+ fromProbabilityCache' $
+ (addDocument' newDoc . deleteDocument' oldDoc) (toProbabilityCache' probabilityCache)
+
+addDocument' :: D.Document -> ProbabilityCache' -> ProbabilityCache'
+addDocument' doc probabilityCache =
+ combining (+) probabilityCache (fromDocument doc)
+
+deleteDocument' :: D.Document -> ProbabilityCache' -> ProbabilityCache'
+deleteDocument' doc probabilityCache =
+ combining (-) probabilityCache (fromDocument doc)
+
+combining :: (Int -> Int -> Int) -> ProbabilityCache' -> ProbabilityCache' -> ProbabilityCache'
+combining f probabilityCache probabilityCache' =
+ ProbabilityCache' {..}
+ where
+ docsPerWord = M.unionWith f probabilityCache.docsPerWord probabilityCache'.docsPerWord
+ docsPerTag = M.unionWith f probabilityCache.docsPerTag probabilityCache'.docsPerTag
+ docsPerWordAndTag = M.unionWith f probabilityCache.docsPerWordAndTag probabilityCache'.docsPerWordAndTag
+
+fromDocument :: D.Document -> ProbabilityCache'
+fromDocument doc =
+ let words = S.toList doc.index.originalWords
+ tags = S.toList doc.index.tags
+ wordAndTags = [(word, tag) | word <- words, tag <- tags]
+ in ProbabilityCache'
+ { docsPerWord = docsPer words,
+ docsPerTag = docsPer tags,
+ docsPerWordAndTag = docsPer wordAndTags
+ }
+ where
+ docsPer :: Ord k => [k] -> M.Map k Int
+ docsPer = M.fromList . map (flip (,) 1)