summaryrefslogtreecommitdiffstats
path: root/app/Document.hs
blob: fa9f33febe8f5eac21c4e296bb0d0e3c2f55ba3b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE OverloadedRecordDot #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE StrictData #-}

module Document
  ( Document (..),
    Index (..),
    hasWord,
    hasTag,
    hasWordAndTag,
    tagValues,
    getDocuments,
    readDocument,
  )
where

import Control.DeepSeq (NFData (rnf))
import Control.Exception (Exception, throwIO)
import Data.Aeson qualified as J
import Data.ByteString.Lazy qualified as LB
import Data.List (sort)
import Data.Map qualified as M
import Data.Maybe (mapMaybe, maybeToList)
import Data.Set qualified as S
import Data.Text qualified as T
import Data.Time.Clock (UTCTime)
import Data.Time.Format.ISO8601 (iso8601Show)
import GHC.Generics (Generic)
import GHC.Records (HasField (..))
import Parallel (parMapM)
import System.Directory (listDirectory)
import System.FilePath (takeBaseName, (<.>), (</>))
import Tag qualified as G

-- TODO Inline `Index`

-- TODO Replace `iFileName` with `id`
data Document = Document
  { iFileName :: String,
    index :: Index
  }
  deriving (Show)

instance NFData Document where
  rnf (Document {..}) = rnf iFileName `seq` rnf index

instance HasField "oFilePath" Document FilePath where
  getField doc = "originals" </> takeBaseName doc.iFileName <.> "pdf"

instance HasField "iFilePath" Document FilePath where
  getField doc = "index" </> doc.iFileName

hasWord :: T.Text -> Document -> Bool
hasWord word doc = S.member word doc.index.originalWords

hasTag :: G.Tag -> Document -> Bool
hasTag tag doc = S.member tag doc.index.tags

hasWordAndTag :: T.Text -> G.Tag -> Document -> Bool
hasWordAndTag word tag doc = hasTag tag doc && hasWord word doc

tagValues :: [Document] -> M.Map T.Text (S.Set T.Text)
tagValues docs =
  M.unionsWith S.union $
    mapMaybe
      ( \tag ->
          M.singleton (G.tagKey tag) . S.singleton <$> (G.tagValue tag)
      )
      (S.toList (S.unions (map (.index.tags) docs)))

getDocuments :: IO [Document]
getDocuments =
  parMapM readDocument
    =<< sort <$> listDirectory "index"

readDocument :: FilePath -> IO Document
readDocument iFileName =
  Document iFileName
    <$> decodeFile @Index ("index" </> iFileName)

data Index = Index
  { content :: T.Text,
    tags :: S.Set G.Tag,
    addedAt :: UTCTime,
    modifiedAt :: Maybe UTCTime,
    todo :: Bool,
    language :: String
  }
  deriving (Show, Generic, Eq)

instance NFData Index where
  rnf (Index {..}) =
    rnf content `seq`
      rnf tags `seq`
        rnf addedAt `seq`
          rnf modifiedAt `seq`
            rnf todo `seq`
              rnf language

instance J.ToJSON Index

instance J.FromJSON Index

instance HasField "shortText" Index T.Text where
  getField =
    (T.unlines . take 10 . T.lines)
      . (.content)

instance HasField "internalTags" Index (S.Set G.Tag) where
  getField index =
    index.tags `S.union` internalTags index

-- TODO Cache `originalWords`
--
-- @related cache-probabilitymap
instance HasField "originalWords" Index (S.Set T.Text) where
  getField index =
    S.fromList (T.words index.content)

internalTags :: Index -> S.Set G.Tag
internalTags index =
  S.fromList
    ( concat
        [ [ G.tag "addedAt" (Just (T.pack (iso8601Show index.addedAt))),
            G.tag "language" (Just (T.pack index.language))
          ],
          maybeToList (G.tag "modifiedAt" . Just . T.pack . iso8601Show <$> index.modifiedAt),
          if index.todo then [G.tag "todo" Nothing] else []
        ]
    )

data DecodeException = DecodeException FilePath String
  deriving (Show)

instance Exception DecodeException

decodeFile :: J.FromJSON a => FilePath -> IO a
decodeFile fp =
  either (throwIO . DecodeException fp) pure . J.eitherDecode
    =<< LB.readFile fp