1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
|
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE OverloadedRecordDot #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE StrictData #-}
module Document
( Document (..),
Index (..),
hasWord,
hasTag,
hasWordAndTag,
tagValues,
getDocuments,
readDocument,
)
where
import Control.Concurrent.ParallelIO.Local (parallel, withPool)
import Control.Exception (Exception, throwIO)
import Data.Aeson qualified as J
import Data.ByteString.Lazy qualified as LB
import Data.List (sort)
import Data.Map qualified as M
import Data.Maybe (mapMaybe, maybeToList)
import Data.Set qualified as S
import Data.Text qualified as T
import Data.Time.Clock (UTCTime)
import Data.Time.Format.ISO8601 (iso8601Show)
import GHC.Conc (getNumProcessors)
import GHC.Generics (Generic)
import GHC.Records (HasField (..))
import System.Directory (listDirectory)
import System.FilePath (takeBaseName, (<.>), (</>))
import Tag qualified as G
-- TODO Inline `Index`
-- TODO Replace `iFileName` with `id`
data Document = Document
{ iFileName :: String,
index :: Index
}
deriving (Show)
instance HasField "oFilePath" Document FilePath where
getField doc = "originals" </> takeBaseName doc.iFileName <.> "pdf"
instance HasField "iFilePath" Document FilePath where
getField doc = "index" </> doc.iFileName
hasWord :: T.Text -> Document -> Bool
hasWord word doc = S.member word doc.index.originalWords
hasTag :: G.Tag -> Document -> Bool
hasTag tag doc = S.member tag doc.index.tags
hasWordAndTag :: T.Text -> G.Tag -> Document -> Bool
hasWordAndTag word tag doc = hasTag tag doc && hasWord word doc
tagValues :: [Document] -> M.Map T.Text (S.Set T.Text)
tagValues docs =
M.unionsWith S.union $
mapMaybe
( \tag ->
M.singleton (G.tagKey tag) . S.singleton <$> (G.tagValue tag)
)
(S.toList (S.unions (map (.index.tags) docs)))
getDocuments :: IO [Document]
getDocuments =
parMapM readDocument
=<< sort <$> listDirectory "index"
readDocument :: FilePath -> IO Document
readDocument iFileName =
Document iFileName
<$> decodeFile @Index ("index" </> iFileName)
data Index = Index
{ content :: T.Text,
tags :: S.Set G.Tag,
addedAt :: UTCTime,
modifiedAt :: Maybe UTCTime,
todo :: Bool,
language :: String
}
deriving (Show, Generic, Eq)
instance J.ToJSON Index
instance J.FromJSON Index
instance HasField "shortText" Index T.Text where
getField =
(T.unlines . take 10 . T.lines)
. (.content)
instance HasField "internalTags" Index (S.Set G.Tag) where
getField index =
index.tags `S.union` internalTags index
-- TODO Cache `originalWords`
--
-- @related cache-probabilitymap
instance HasField "originalWords" Index (S.Set T.Text) where
getField index =
S.fromList (T.words index.content)
internalTags :: Index -> S.Set G.Tag
internalTags index =
S.fromList
( concat
[ [ G.tag "addedAt" (Just (T.pack (iso8601Show index.addedAt))),
G.tag "language" (Just (T.pack index.language))
],
maybeToList (G.tag "modifiedAt" . Just . T.pack . iso8601Show <$> index.modifiedAt),
if index.todo then [G.tag "todo" Nothing] else []
]
)
parMapM :: (a -> IO b) -> [a] -> IO [b]
parMapM f xs = do
n <- getNumProcessors
withPool n $ \pool -> parallel pool (map f xs)
data DecodeException = DecodeException FilePath String
deriving (Show)
instance Exception DecodeException
decodeFile :: J.FromJSON a => FilePath -> IO a
decodeFile fp =
either (throwIO . DecodeException fp) pure . J.eitherDecode
=<< LB.readFile fp
|