summaryrefslogtreecommitdiffstats
path: root/app/Document.hs
blob: ac8a73b747b8b8272c037d749629cf9a747914ec (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE OverloadedRecordDot #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE StrictData #-}

module Document
  ( Document (..),
    Index (..),
    hasWord,
    hasTag,
    hasWordAndTag,
    tagValues,
    getDocuments,
    readDocument,
  )
where

import Control.Concurrent.ParallelIO.Local (parallel, withPool)
import Control.Exception (Exception, throwIO)
import Data.Aeson qualified as J
import Data.ByteString.Lazy qualified as LB
import Data.List (sort)
import Data.Map qualified as M
import Data.Maybe (mapMaybe, maybeToList)
import Data.Set qualified as S
import Data.Text qualified as T
import Data.Time.Clock (UTCTime)
import Data.Time.Format.ISO8601 (iso8601Show)
import GHC.Conc (getNumProcessors)
import GHC.Generics (Generic)
import GHC.Records (HasField (..))
import System.Directory (listDirectory)
import System.FilePath (takeBaseName, (<.>), (</>))
import Tag qualified as G

-- TODO Inline `Index`

-- TODO Replace `iFileName` with `id`
data Document = Document
  { iFileName :: String,
    index :: Index
  }
  deriving (Show)

instance HasField "oFilePath" Document FilePath where
  getField doc = "originals" </> takeBaseName doc.iFileName <.> "pdf"

instance HasField "iFilePath" Document FilePath where
  getField doc = "index" </> doc.iFileName

hasWord :: T.Text -> Document -> Bool
hasWord word doc = S.member word doc.index.originalWords

hasTag :: G.Tag -> Document -> Bool
hasTag tag doc = S.member tag doc.index.tags

hasWordAndTag :: T.Text -> G.Tag -> Document -> Bool
hasWordAndTag word tag doc = hasTag tag doc && hasWord word doc

tagValues :: [Document] -> M.Map T.Text (S.Set T.Text)
tagValues docs =
  M.unionsWith S.union $
    mapMaybe
      ( \tag ->
          M.singleton (G.tagKey tag) . S.singleton <$> (G.tagValue tag)
      )
      (S.toList (S.unions (map (.index.tags) docs)))

getDocuments :: IO [Document]
getDocuments =
  parMapM readDocument
    =<< sort <$> listDirectory "index"

readDocument :: FilePath -> IO Document
readDocument iFileName =
  Document iFileName
    <$> decodeFile @Index ("index" </> iFileName)

data Index = Index
  { content :: T.Text,
    tags :: S.Set G.Tag,
    addedAt :: UTCTime,
    modifiedAt :: Maybe UTCTime,
    todo :: Bool,
    language :: String
  }
  deriving (Show, Generic, Eq)

instance J.ToJSON Index

instance J.FromJSON Index

instance HasField "shortText" Index T.Text where
  getField =
    (T.unlines . take 10 . T.lines)
      . (.content)

instance HasField "internalTags" Index (S.Set G.Tag) where
  getField index =
    index.tags `S.union` internalTags index

-- TODO Cache `originalWords`
--
-- @related cache-probabilitymap
instance HasField "originalWords" Index (S.Set T.Text) where
  getField index =
    S.fromList (T.words index.content)

internalTags :: Index -> S.Set G.Tag
internalTags index =
  S.fromList
    ( concat
        [ [ G.tag "addedAt" (Just (T.pack (iso8601Show index.addedAt))),
            G.tag "language" (Just (T.pack index.language))
          ],
          maybeToList (G.tag "modifiedAt" . Just . T.pack . iso8601Show <$> index.modifiedAt),
          if index.todo then [G.tag "todo" Nothing] else []
        ]
    )

parMapM :: (a -> IO b) -> [a] -> IO [b]
parMapM f xs = do
  n <- getNumProcessors
  withPool n $ \pool -> parallel pool (map f xs)

data DecodeException = DecodeException FilePath String
  deriving (Show)

instance Exception DecodeException

decodeFile :: J.FromJSON a => FilePath -> IO a
decodeFile fp =
  either (throwIO . DecodeException fp) pure . J.eitherDecode
    =<< LB.readFile fp