aboutsummaryrefslogtreecommitdiffstats
path: root/app/History
diff options
context:
space:
mode:
authorLibravatar Alexander Foremny <aforemny@posteo.de>2023-12-18 13:50:22 +0100
committerLibravatar Alexander Foremny <aforemny@posteo.de>2024-03-25 07:42:51 +0100
commitfc0afaaa273f5b5d3696df87d70d5347a13bb9ac (patch)
treea7e48842f71511f39a367e5dff84f41c02f3d859 /app/History
parent812fcbadae72960d200286355c9aaecfbe350bf2 (diff)
feat: compute history top to bottom
Disables caching.
Diffstat (limited to 'app/History')
-rw-r--r--app/History/IssueEvents.hs183
-rw-r--r--app/History/Issues.hs72
-rw-r--r--app/History/Plan.hs207
-rw-r--r--app/History/Scramble.hs76
4 files changed, 538 insertions, 0 deletions
diff --git a/app/History/IssueEvents.hs b/app/History/IssueEvents.hs
new file mode 100644
index 0000000..176d660
--- /dev/null
+++ b/app/History/IssueEvents.hs
@@ -0,0 +1,183 @@
+module History.IssueEvents
+ ( IssueEvents (..),
+ )
+where
+
+import Data.Binary (Binary)
+import Data.Function (on)
+import Data.List
+import Data.List.NonEmpty qualified as NE
+import Data.Map qualified as M
+import Data.Maybe (fromMaybe)
+import Data.Ord (comparing)
+import Data.Proxy (Proxy)
+import GHC.Generics (Generic)
+import Git qualified
+import History.Plan (Id, Planable, Proto, assume, propagate, protoOf)
+import History.Scramble (Scramble (..), getIssuesOfFile)
+import Issue (Issue (..))
+import IssueEvent (IssueEvent (..))
+import IssueEvent qualified as E
+
+data IssueEvents = IssueEvents
+ { commitHash :: Git.CommitHash,
+ issueEvents :: [E.IssueEvent]
+ }
+ deriving (Show, Generic, Binary)
+
+instance Planable IssueEvents where
+ type Id IssueEvents = Git.CommitHash
+ type Proto IssueEvents = Scramble
+ protoOf :: Proxy IssueEvents -> Git.CommitHash -> IO Scramble
+ protoOf _ commitHash@Git.WorkingTree = do
+ filesChanged <- Git.getFilesOf commitHash
+ issues <- concat <$> mapM (getIssuesOfFile commitHash) filesChanged
+ pure $
+ Scramble
+ { issues =
+ M.unions
+ [ M.singleton issue.id issue | issue <- issues
+ ],
+ ..
+ }
+ protoOf _ commitHash@(Git.Commit _) = do
+ filesChanged <- Git.getChangedFilesOf commitHash
+ issues <- concat <$> mapM (getIssuesOfFile commitHash) filesChanged
+ pure $
+ Scramble
+ { issues =
+ M.unions
+ [ M.singleton issue.id issue | issue <- issues
+ ],
+ ..
+ }
+
+ assume :: Scramble -> IssueEvents
+ assume Scramble {..} =
+ IssueEvents
+ { issueEvents =
+ [ IssueCreated commitHash issue
+ | issue <- M.elems issues
+ ],
+ ..
+ }
+
+ propagate ::
+ [Git.CommitHash] ->
+ IssueEvents ->
+ Scramble ->
+ IssueEvents
+ propagate = propagateIssueEvents
+
+-- | Propagates issue events from a top commit to a consecutive bottom commit.
+--
+-- Because we are building the history of issue events backwards, we use *top* and *bottom* to refer to the later and earlier commit within the Git history, respectively. We use *earlier* and *later* to refer to the top and bottom commits within the application of this function. In particular, top commits are processed earlier, and bottom commits are processed later.
+--
+-- For the top/earlier commit, issue events are known. For the bottom/later commit, only the scramble is known.
+--
+-- The scramble is said to *witness* an issue, if its changed files contain the issue's file. Thus, in addition to the issues a scramble contains, it witnesses the issues that have been deleted in the scramble's commit.
+--
+-- Because we are building the history from top/earlier to bottom/later commits, we have to assume that at any top/earlier commit, the issues present have been created in that commit. This function advances the issue's original commit as we learn about bottom/later commits from scrambles.
+--
+-- A scramble *advances* an issue's original commit if contains it or does not witness it. In particular, a scramble does NOT advance an issue's original commit, if it *just witnesses* it (ie. witnesses it, but does not contain it). To understand this, consider the following scenarios (A) and (B). Suppose the top/ earlier commit is `commit-c`, and the bottom/ later commit is `commit-a`. Between them, the commit `commit-b` is intermediary. Suppose for scenario (A) that `issue-1` is created in `commit-a`, deleted in `commit-b` and re-opened in `commit-c`. Suppose for scenario (B) that `issue-1` is originally created in `commit-c`. Then, at `commit-b` the following holds true:
+--
+-- - The issue `issue-1` has been created in `commit-c`, because it is present.
+-- - The issue `issue-1 is not present in `commit-b`.
+-- - The scramble of `commit-b` can witness `issue-1` in both scenarios (A), (B) even though it does not contain it.
+--
+-- Thus, we cannot decide at `commit-b` whether `issue-1` has been initially created in the top/ earlier commit `commit-c`, or whether is has been re-opened in `commit-c`. We need the information of presence in `commit-a` to decide this.
+--
+-- The most confusing edge case it the re-opening of issues. Suppose `issue-1` has been created in `commit-a`, deleted in `commit-b` and re-opened in `commit-c`. Contrast this with the scenario that `issue-1` has initially been created in `commit-c`. Observe the following as we propagate events from `commmit-c` to `commit-a`:
+--
+-- - At `commit-c`: `issue-1` is present in both scenarios, thus it is initially tracked as "created" in `commit-c`.
+-- - At `commit-b`: `issue-1` is NOT present in both scenarios. Note that the scramble can witness `issue-1` in either cases.
+-- - At `commit-a`: `issue-1` is present in the first scenario, but not in the second.
+--
+-- So, in the case `issue-1` has been re-opened, we cannot track its deletion at `commit-b`, because whether it was re-opened or originally created at `commit-b` depends on whether `issue-1` is present in the bottom `commit-a`, that we process only later. Thus, the scramble of commit `commit-b` cannot safely advance the issue's original commit, and we use this information to track re-opening of commits at later commits.
+--
+-- Note that in the whole process, issue change events and issue deletion events can never be bottom-most/latest events, as they would depend on information not yet known, ie. the first commit can neither change nor delete an issue.
+propagateIssueEvents :: [Git.CommitHash] -> IssueEvents -> Scramble -> IssueEvents
+propagateIssueEvents log topIssueEvents bottomScramble =
+ IssueEvents
+ { commitHash = bottomScramble.commitHash,
+ issueEvents =
+ sortOn logOrder $
+ let issueEventsPerIssue =
+ groupPerIssue topIssueEvents.issueEvents
+ in concat
+ [ -- CASE 1. The issue is present in the top/earlier history and bottom/later scramble. We can safely advance the issue's original commit.
+ concat
+ [ NE.init issueEvents ++ case NE.last issueEvents of
+ (IssueCreated commitHash issue) ->
+ if commitHash /= topCommitHash
+ then
+ [ IssueCreated commitHash issue,
+ IssueDeleted topCommitHash issue,
+ IssueCreated bottomCommitHash bottomIssue
+ ]
+ else
+ concat
+ [ if topIssue `neq` bottomIssue
+ then [IssueChanged commitHash bottomIssue issue]
+ else [],
+ [IssueCreated bottomCommitHash bottomIssue]
+ ]
+ _ -> error "bottom issues can only be created"
+ | issueEvents <- issueEventsPerIssue,
+ let topIssue = issue (NE.last issueEvents),
+ bottomIssue <- bottomIssues,
+ bottomIssue `sym` topIssue
+ ],
+ concat
+ [ -- CASE 2. The issue is present in the top/earlier history, not contained in the bottom/later scramble, but witnessed by the bottom/later scramble. We cannot safely advance the issue's original commit.
+ NE.init issueEvents ++ case NE.last issueEvents of
+ (IssueCreated commitHash issue) -> [IssueCreated commitHash issue]
+ _ -> error "bottom issues can only be created"
+ | issueEvents <- issueEventsPerIssue,
+ let topIssue = issue (NE.last issueEvents),
+ all (`nsym` topIssue) bottomIssues,
+ topIssue.file `elem` bottomScramble.filesChanged
+ ],
+ concat
+ [ -- CASE 3. The issue is present in the top/earlier history, but not witnessed by the bottom/later scramble. It is safe to advance the issue's commit hash.
+ NE.init issueEvents ++ case NE.last issueEvents of
+ (IssueCreated _ issue) -> [IssueCreated bottomCommitHash issue]
+ _ -> error "bottom issues can only be created"
+ | issueEvents <- issueEventsPerIssue,
+ let topIssue = issue (NE.last issueEvents),
+ all (`nsym` topIssue) bottomIssues,
+ not (topIssue.file `elem` bottomScramble.filesChanged)
+ ],
+ concat
+ [ -- CASE 4. The issue it not present in the top/earlier history, but contained in the bottom scramble. It had to be deleted by the top/ earlier commit.
+ [ IssueDeleted topIssueEvents.commitHash bottomIssue,
+ IssueCreated bottomCommitHash bottomIssue
+ ]
+ | bottomIssue <- bottomIssues,
+ all ((`nsym` bottomIssue) . issue . NE.last) issueEventsPerIssue
+ ]
+ ]
+ }
+ where
+ groupPerIssue =
+ map (NE.sortBy (comparing logOrder)) . NE.groupBy (sym `on` issue) . sortOn ((.id) . issue)
+
+ topCommitHash = topIssueEvents.commitHash
+ bottomCommitHash = bottomScramble.commitHash
+
+ bottomIssues = M.elems bottomScramble.issues
+
+ neq, sym, nsym :: Issue -> Issue -> Bool
+ sym = (==) `on` (.id)
+ nsym = (/=) `on` (.id)
+ neq = (/=) `on` (.rawTextHash)
+
+ commitHash (IssueChanged commitHash _ _) = commitHash
+ commitHash (IssueCreated commitHash _) = commitHash
+ commitHash (IssueDeleted commitHash _) = commitHash
+
+ issue (IssueChanged _ _ issue) = issue
+ issue (IssueCreated _ issue) = issue
+ issue (IssueDeleted _ issue) = issue
+
+ logOrder = fromMaybe (-1) . (`elemIndex` log) . commitHash
diff --git a/app/History/Issues.hs b/app/History/Issues.hs
new file mode 100644
index 0000000..08ad772
--- /dev/null
+++ b/app/History/Issues.hs
@@ -0,0 +1,72 @@
+module History.Issues
+ ( Issues (..),
+ )
+where
+
+import Data.Binary (Binary)
+import Data.Function (on)
+import Data.Map qualified as M
+import Data.Proxy (Proxy)
+import Data.Text qualified as T
+import GHC.Generics (Generic)
+import Git qualified
+import History.Plan (Id, Planable, Proto, assume, propagate, protoOf)
+import History.Scramble (Scramble (..), getIssuesOfFile)
+import Issue qualified as I
+
+data Issues = Issues
+ { commitHash :: Git.CommitHash,
+ issues :: M.Map T.Text I.Issue
+ }
+ deriving (Show, Generic, Binary)
+
+instance Planable Issues where
+ type Id Issues = Git.CommitHash
+ type Proto Issues = Scramble
+ protoOf :: Proxy Issues -> Git.CommitHash -> IO Scramble
+ protoOf _ commitHash = do
+ filesChanged <- Git.getChangedFilesOf commitHash
+ issues <- concat <$> mapM (getIssuesOfFile commitHash) filesChanged
+ pure $
+ Scramble
+ { issues =
+ M.unions
+ [ M.singleton issue.id issue | issue <- issues
+ ],
+ ..
+ }
+
+ assume :: Scramble -> Issues
+ assume (Scramble {..}) = Issues {..}
+
+ propagate :: [Id Issues] -> Issues -> Scramble -> Issues
+ propagate _ topIssues bottomScramble =
+ Issues
+ { commitHash = topIssues.commitHash,
+ issues =
+ M.mergeWithKey
+ ( \_ topIssue bottomIssue ->
+ Just $
+ topIssue
+ { I.provenance =
+ I.Provenance
+ { first = bottomIssue.provenance.first,
+ last =
+ if ((/=) `on` (.rawTextHash)) topIssue bottomIssue
+ then topIssue.provenance.last
+ else bottomIssue.provenance.last
+ }
+ }
+ )
+ ( \topIssues -> topIssues
+ )
+ ( \bottomIssues ->
+ M.map
+ ( \bottomIssue ->
+ bottomIssue {I.closed = True}
+ )
+ bottomIssues
+ )
+ topIssues.issues
+ bottomScramble.issues
+ }
diff --git a/app/History/Plan.hs b/app/History/Plan.hs
new file mode 100644
index 0000000..3dec391
--- /dev/null
+++ b/app/History/Plan.hs
@@ -0,0 +1,207 @@
+module History.Plan
+ ( Planable (..),
+ formulate,
+ realise,
+ )
+where
+
+import Control.Concurrent (forkIO, killThread)
+import Control.Concurrent.STM (TQueue, TVar, atomically, newTQueueIO, newTVarIO, readTQueue, readTVar, retry, writeTQueue, writeTVar)
+import Control.Monad (forever)
+import Data.Kind (Type)
+import Data.List (intercalate)
+import Data.List.NonEmpty qualified as NE
+import Data.Map qualified as M
+import Data.Proxy (Proxy (Proxy))
+import Data.Time.Clock
+import Parallel (parMapM_)
+import System.IO (hFlush, hPutStr, stderr, stdout)
+import System.Time.Monotonic (Clock, clockGetTime, newClock)
+import Text.Printf (printf)
+
+-- | Left-associative monadic fold of a structure with automatic parallelism, status-reporting and intermittent results (cacelability).
+--
+-- Semantically, `realise (formulate xs)` is equivalent to
+--
+-- ```haskell
+-- \(x:xs) -> foldM propagate (assume <$> protoOf x) =<< mapM protoOf xs
+-- ```
+--
+-- However, this module provides the following features:
+--
+-- - `protoOf` and `propagate` are computed in parallel,
+-- - `propagate` is scheduled to terminate as soon as possible,
+--
+-- This makes it possible to scale the computation across multiple cores, but still interrupt the process for intermittent results.
+--
+-- Additionally, a progress report is presented on stdout.
+class Planable output where
+ type Id output :: Type
+ type Proto output :: Type
+ protoOf :: Proxy output -> Id output -> IO (Proto output)
+ assume :: Proto output -> output
+ propagate :: [Id output] -> output -> Proto output -> output
+
+data Plan output = Plan (NE.NonEmpty (Id output)) [Task output]
+
+data Task output
+ = Compute (Id output)
+ | Propagate (Maybe (Id output)) (Id output)
+
+isCompute, isPropagate :: Task output -> Bool
+isCompute (Compute _) = True
+isCompute _ = False
+isPropagate (Propagate _ _) = True
+isPropagate _ = False
+
+formulate ::
+ Planable output =>
+ Proxy output ->
+ NE.NonEmpty (Id output) ->
+ Plan output
+formulate _ ids@(NE.uncons -> (id, (maybe [] NE.toList -> rest))) =
+ Plan ids $
+ Compute id
+ : Propagate Nothing id
+ : concat
+ ( zipWith
+ ( \id id' ->
+ [ Compute id,
+ Propagate id' id
+ ]
+ )
+ rest
+ (map Just (id : rest))
+ )
+
+data State output = State
+ { tasks :: [Task output],
+ protos :: M.Map (Id output) (Proto output),
+ outputs :: M.Map (Id output) output,
+ elapsed :: Clock
+ }
+
+realise :: (Ord (Id output), Planable output) => Plan output -> IO output
+realise plan@(Plan ids tasks) = do
+ elapsed <- newClock
+ let state0 =
+ State
+ { tasks = tasks,
+ protos = M.empty,
+ outputs = M.empty,
+ elapsed = elapsed
+ }
+ stateT <- newTVarIO state0
+ statusT <- newTQueueIO
+ tid <- forkIO $ forever do
+ state <- atomically (readTQueue statusT)
+ hPutStr stderr . printStatus =<< status state0 state
+ hFlush stdout
+ parMapM_ (step Proxy plan statusT stateT) tasks
+ let id = NE.last ids
+ output <- atomically $ do
+ maybe retry pure . M.lookup id . (.outputs) =<< readTVar stateT
+ killThread tid
+ pure output
+
+step ::
+ (Ord (Id output), Planable output) =>
+ Proxy output ->
+ Plan output ->
+ TQueue (State output) ->
+ TVar (State output) ->
+ Task output ->
+ IO ()
+step proxy _ statusT stateT (Compute id) = do
+ proto <- protoOf proxy id
+ atomically do
+ state <- readTVar stateT
+ let state' = state {protos = M.insert id proto state.protos}
+ writeTVar stateT state'
+ writeTQueue statusT state'
+ pure ()
+step _ (Plan ids _) statusT stateT (Propagate (Just id') id) = do
+ (output, proto) <- atomically do
+ state <- readTVar stateT
+ output <- maybe retry pure (M.lookup id' state.outputs)
+ proto <- maybe retry pure (M.lookup id state.protos)
+ pure (output, proto)
+ let output' = propagate (NE.toList ids) output proto
+ atomically do
+ state <- readTVar stateT
+ let state' = state {outputs = M.insert id output' state.outputs}
+ writeTVar stateT state'
+ writeTQueue statusT state'
+step _ _ statusT stateT (Propagate Nothing id) = do
+ proto <- atomically do
+ state <- readTVar stateT
+ maybe retry pure (M.lookup id state.protos)
+ let output = assume proto
+ atomically do
+ state <- readTVar stateT
+ let state' = state {outputs = M.insert id output state.outputs}
+ writeTVar stateT state'
+ writeTQueue statusT state'
+
+data Status = Status
+ { numTasks :: Progress Int,
+ numProtos :: Progress Int,
+ numOutputs :: Progress Int,
+ elapsed :: DiffTime
+ }
+ deriving (Show, Eq)
+
+data Progress a = Progress
+ { total :: a,
+ completed :: a
+ }
+ deriving (Show, Eq)
+
+status :: State output -> State output -> IO Status
+status state0 state = do
+ let totalTasks = length state0.tasks
+ totalProtos = length (filter isCompute state0.tasks)
+ totalOutputs = length (filter isPropagate state0.tasks)
+ completedTasks = completedProtos + completedOutputs
+ completedProtos = M.size state.protos
+ completedOutputs = M.size state.outputs
+ elapsed <- clockGetTime state.elapsed
+ pure
+ Status
+ { numTasks = Progress totalTasks completedTasks,
+ numProtos = Progress totalProtos completedProtos,
+ numOutputs = Progress totalOutputs completedOutputs,
+ elapsed = elapsed
+ }
+
+printStatus :: Status -> String
+printStatus Status {..} = do
+ let formatProgress completed total = pad total completed <> "/" <> total
+ pad total completed = replicate (length total - length completed) ' ' <> completed
+ eta =
+ formatEta
+ ( (fromIntegral numTasks.total * (realToFrac elapsed))
+ / fromIntegral numTasks.completed
+ )
+ (<> "\r") . intercalate " " $
+ [ formatProgress (show numTasks.completed) (show numTasks.total),
+ formatProgress (show numProtos.completed) (show numProtos.total),
+ formatProgress (show numOutputs.completed) (show numOutputs.total),
+ "ETA " <> eta
+ ]
+
+formatEta :: Double -> String
+formatEta s = do
+ if s < 60
+ then printf "%.1fs" s
+ else do
+ let m = s / 60
+ if m < 60
+ then printf "%.1fm" m
+ else do
+ let h = m / 60
+ if h < 24
+ then printf "%.1hf" h
+ else do
+ let d = h / 24
+ printf "%.1fd" d
diff --git a/app/History/Scramble.hs b/app/History/Scramble.hs
new file mode 100644
index 0000000..9004dbf
--- /dev/null
+++ b/app/History/Scramble.hs
@@ -0,0 +1,76 @@
+module History.Scramble
+ ( Scramble (..),
+ getIssuesOfFile,
+ fromComment,
+ )
+where
+
+import CMark qualified as D
+import Comment qualified as G
+import Control.Exception (Handler (..), catches)
+import Data.Binary (Binary)
+import Data.Digest.Pure.SHA qualified as S
+import Data.Map qualified as M
+import Data.Maybe (catMaybes)
+import Data.Text qualified as T
+import Data.Text.Lazy qualified as LT
+import Data.Text.Lazy.Encoding qualified as LT
+import Exception qualified as E
+import GHC.Generics (Generic)
+import Git qualified
+import Issue qualified as I
+import Issue.Parser qualified as I
+import Issue.Tag qualified as I
+import Issue.Text qualified as I
+import Parallel (parMapM)
+import Render qualified as P
+
+-- | `Scramble` records the complete issues ONLY in files that have
+-- been changed in the commit.
+data Scramble = Scramble
+ { commitHash :: Git.CommitHash,
+ filesChanged :: [FilePath],
+ issues :: M.Map T.Text I.Issue
+ }
+ deriving (Show, Binary, Generic)
+
+-- | Get all issues in the given directory and file.
+getIssuesOfFile :: Git.CommitHash -> FilePath -> IO [I.Issue]
+getIssuesOfFile commitHash filename =
+ ( fmap catMaybes . parMapM (fromComment commitHash)
+ =<< G.getComments commitHash filename
+ )
+ `catches` [ Handler \(_ :: E.UnknownFile) -> pure [],
+ Handler \(_ :: E.UnsupportedLanguage) -> pure []
+ ]
+
+-- | Note that `provenance` is trivial and needs to be fixed up later.
+fromComment :: Git.CommitHash -> G.Comment -> IO (Maybe I.Issue)
+fromComment commitHash comment = do
+ commit <- Git.getCommitOf commitHash
+ let provenance = I.Provenance commit commit
+
+ pure $
+ ( \parseResult ->
+ let (markers, title) =
+ I.stripIssueMarkers (T.pack (show (P.render parseResult.heading)))
+ in I.Issue
+ { commitHash = commitHash,
+ language = comment.language,
+ rawTextHash = S.sha1 (LT.encodeUtf8 (LT.fromStrict rawText)),
+ title = title,
+ file = comment.filePath,
+ provenance = provenance,
+ startByte = comment.startByte,
+ endByte = comment.endByte,
+ startPoint = comment.startPoint,
+ endPoint = comment.endPoint,
+ tags = I.extractTags parseResult.tags,
+ markers = markers,
+ commentStyle = commentStyle,
+ closed = False
+ }
+ )
+ <$> I.parse I.issueMarkers (D.commonmarkToNode [] rawText)
+ where
+ (commentStyle, rawText) = G.uncomment comment.language comment.text