diff options
author | Alexander Foremny <aforemny@posteo.de> | 2023-12-18 13:50:22 +0100 |
---|---|---|
committer | Alexander Foremny <aforemny@posteo.de> | 2024-03-25 07:42:51 +0100 |
commit | fc0afaaa273f5b5d3696df87d70d5347a13bb9ac (patch) | |
tree | a7e48842f71511f39a367e5dff84f41c02f3d859 /app/History | |
parent | 812fcbadae72960d200286355c9aaecfbe350bf2 (diff) |
feat: compute history top to bottom
Disables caching.
Diffstat (limited to 'app/History')
-rw-r--r-- | app/History/IssueEvents.hs | 183 | ||||
-rw-r--r-- | app/History/Issues.hs | 72 | ||||
-rw-r--r-- | app/History/Plan.hs | 207 | ||||
-rw-r--r-- | app/History/Scramble.hs | 76 |
4 files changed, 538 insertions, 0 deletions
diff --git a/app/History/IssueEvents.hs b/app/History/IssueEvents.hs new file mode 100644 index 0000000..176d660 --- /dev/null +++ b/app/History/IssueEvents.hs @@ -0,0 +1,183 @@ +module History.IssueEvents + ( IssueEvents (..), + ) +where + +import Data.Binary (Binary) +import Data.Function (on) +import Data.List +import Data.List.NonEmpty qualified as NE +import Data.Map qualified as M +import Data.Maybe (fromMaybe) +import Data.Ord (comparing) +import Data.Proxy (Proxy) +import GHC.Generics (Generic) +import Git qualified +import History.Plan (Id, Planable, Proto, assume, propagate, protoOf) +import History.Scramble (Scramble (..), getIssuesOfFile) +import Issue (Issue (..)) +import IssueEvent (IssueEvent (..)) +import IssueEvent qualified as E + +data IssueEvents = IssueEvents + { commitHash :: Git.CommitHash, + issueEvents :: [E.IssueEvent] + } + deriving (Show, Generic, Binary) + +instance Planable IssueEvents where + type Id IssueEvents = Git.CommitHash + type Proto IssueEvents = Scramble + protoOf :: Proxy IssueEvents -> Git.CommitHash -> IO Scramble + protoOf _ commitHash@Git.WorkingTree = do + filesChanged <- Git.getFilesOf commitHash + issues <- concat <$> mapM (getIssuesOfFile commitHash) filesChanged + pure $ + Scramble + { issues = + M.unions + [ M.singleton issue.id issue | issue <- issues + ], + .. + } + protoOf _ commitHash@(Git.Commit _) = do + filesChanged <- Git.getChangedFilesOf commitHash + issues <- concat <$> mapM (getIssuesOfFile commitHash) filesChanged + pure $ + Scramble + { issues = + M.unions + [ M.singleton issue.id issue | issue <- issues + ], + .. + } + + assume :: Scramble -> IssueEvents + assume Scramble {..} = + IssueEvents + { issueEvents = + [ IssueCreated commitHash issue + | issue <- M.elems issues + ], + .. + } + + propagate :: + [Git.CommitHash] -> + IssueEvents -> + Scramble -> + IssueEvents + propagate = propagateIssueEvents + +-- | Propagates issue events from a top commit to a consecutive bottom commit. +-- +-- Because we are building the history of issue events backwards, we use *top* and *bottom* to refer to the later and earlier commit within the Git history, respectively. We use *earlier* and *later* to refer to the top and bottom commits within the application of this function. In particular, top commits are processed earlier, and bottom commits are processed later. +-- +-- For the top/earlier commit, issue events are known. For the bottom/later commit, only the scramble is known. +-- +-- The scramble is said to *witness* an issue, if its changed files contain the issue's file. Thus, in addition to the issues a scramble contains, it witnesses the issues that have been deleted in the scramble's commit. +-- +-- Because we are building the history from top/earlier to bottom/later commits, we have to assume that at any top/earlier commit, the issues present have been created in that commit. This function advances the issue's original commit as we learn about bottom/later commits from scrambles. +-- +-- A scramble *advances* an issue's original commit if contains it or does not witness it. In particular, a scramble does NOT advance an issue's original commit, if it *just witnesses* it (ie. witnesses it, but does not contain it). To understand this, consider the following scenarios (A) and (B). Suppose the top/ earlier commit is `commit-c`, and the bottom/ later commit is `commit-a`. Between them, the commit `commit-b` is intermediary. Suppose for scenario (A) that `issue-1` is created in `commit-a`, deleted in `commit-b` and re-opened in `commit-c`. Suppose for scenario (B) that `issue-1` is originally created in `commit-c`. Then, at `commit-b` the following holds true: +-- +-- - The issue `issue-1` has been created in `commit-c`, because it is present. +-- - The issue `issue-1 is not present in `commit-b`. +-- - The scramble of `commit-b` can witness `issue-1` in both scenarios (A), (B) even though it does not contain it. +-- +-- Thus, we cannot decide at `commit-b` whether `issue-1` has been initially created in the top/ earlier commit `commit-c`, or whether is has been re-opened in `commit-c`. We need the information of presence in `commit-a` to decide this. +-- +-- The most confusing edge case it the re-opening of issues. Suppose `issue-1` has been created in `commit-a`, deleted in `commit-b` and re-opened in `commit-c`. Contrast this with the scenario that `issue-1` has initially been created in `commit-c`. Observe the following as we propagate events from `commmit-c` to `commit-a`: +-- +-- - At `commit-c`: `issue-1` is present in both scenarios, thus it is initially tracked as "created" in `commit-c`. +-- - At `commit-b`: `issue-1` is NOT present in both scenarios. Note that the scramble can witness `issue-1` in either cases. +-- - At `commit-a`: `issue-1` is present in the first scenario, but not in the second. +-- +-- So, in the case `issue-1` has been re-opened, we cannot track its deletion at `commit-b`, because whether it was re-opened or originally created at `commit-b` depends on whether `issue-1` is present in the bottom `commit-a`, that we process only later. Thus, the scramble of commit `commit-b` cannot safely advance the issue's original commit, and we use this information to track re-opening of commits at later commits. +-- +-- Note that in the whole process, issue change events and issue deletion events can never be bottom-most/latest events, as they would depend on information not yet known, ie. the first commit can neither change nor delete an issue. +propagateIssueEvents :: [Git.CommitHash] -> IssueEvents -> Scramble -> IssueEvents +propagateIssueEvents log topIssueEvents bottomScramble = + IssueEvents + { commitHash = bottomScramble.commitHash, + issueEvents = + sortOn logOrder $ + let issueEventsPerIssue = + groupPerIssue topIssueEvents.issueEvents + in concat + [ -- CASE 1. The issue is present in the top/earlier history and bottom/later scramble. We can safely advance the issue's original commit. + concat + [ NE.init issueEvents ++ case NE.last issueEvents of + (IssueCreated commitHash issue) -> + if commitHash /= topCommitHash + then + [ IssueCreated commitHash issue, + IssueDeleted topCommitHash issue, + IssueCreated bottomCommitHash bottomIssue + ] + else + concat + [ if topIssue `neq` bottomIssue + then [IssueChanged commitHash bottomIssue issue] + else [], + [IssueCreated bottomCommitHash bottomIssue] + ] + _ -> error "bottom issues can only be created" + | issueEvents <- issueEventsPerIssue, + let topIssue = issue (NE.last issueEvents), + bottomIssue <- bottomIssues, + bottomIssue `sym` topIssue + ], + concat + [ -- CASE 2. The issue is present in the top/earlier history, not contained in the bottom/later scramble, but witnessed by the bottom/later scramble. We cannot safely advance the issue's original commit. + NE.init issueEvents ++ case NE.last issueEvents of + (IssueCreated commitHash issue) -> [IssueCreated commitHash issue] + _ -> error "bottom issues can only be created" + | issueEvents <- issueEventsPerIssue, + let topIssue = issue (NE.last issueEvents), + all (`nsym` topIssue) bottomIssues, + topIssue.file `elem` bottomScramble.filesChanged + ], + concat + [ -- CASE 3. The issue is present in the top/earlier history, but not witnessed by the bottom/later scramble. It is safe to advance the issue's commit hash. + NE.init issueEvents ++ case NE.last issueEvents of + (IssueCreated _ issue) -> [IssueCreated bottomCommitHash issue] + _ -> error "bottom issues can only be created" + | issueEvents <- issueEventsPerIssue, + let topIssue = issue (NE.last issueEvents), + all (`nsym` topIssue) bottomIssues, + not (topIssue.file `elem` bottomScramble.filesChanged) + ], + concat + [ -- CASE 4. The issue it not present in the top/earlier history, but contained in the bottom scramble. It had to be deleted by the top/ earlier commit. + [ IssueDeleted topIssueEvents.commitHash bottomIssue, + IssueCreated bottomCommitHash bottomIssue + ] + | bottomIssue <- bottomIssues, + all ((`nsym` bottomIssue) . issue . NE.last) issueEventsPerIssue + ] + ] + } + where + groupPerIssue = + map (NE.sortBy (comparing logOrder)) . NE.groupBy (sym `on` issue) . sortOn ((.id) . issue) + + topCommitHash = topIssueEvents.commitHash + bottomCommitHash = bottomScramble.commitHash + + bottomIssues = M.elems bottomScramble.issues + + neq, sym, nsym :: Issue -> Issue -> Bool + sym = (==) `on` (.id) + nsym = (/=) `on` (.id) + neq = (/=) `on` (.rawTextHash) + + commitHash (IssueChanged commitHash _ _) = commitHash + commitHash (IssueCreated commitHash _) = commitHash + commitHash (IssueDeleted commitHash _) = commitHash + + issue (IssueChanged _ _ issue) = issue + issue (IssueCreated _ issue) = issue + issue (IssueDeleted _ issue) = issue + + logOrder = fromMaybe (-1) . (`elemIndex` log) . commitHash diff --git a/app/History/Issues.hs b/app/History/Issues.hs new file mode 100644 index 0000000..08ad772 --- /dev/null +++ b/app/History/Issues.hs @@ -0,0 +1,72 @@ +module History.Issues + ( Issues (..), + ) +where + +import Data.Binary (Binary) +import Data.Function (on) +import Data.Map qualified as M +import Data.Proxy (Proxy) +import Data.Text qualified as T +import GHC.Generics (Generic) +import Git qualified +import History.Plan (Id, Planable, Proto, assume, propagate, protoOf) +import History.Scramble (Scramble (..), getIssuesOfFile) +import Issue qualified as I + +data Issues = Issues + { commitHash :: Git.CommitHash, + issues :: M.Map T.Text I.Issue + } + deriving (Show, Generic, Binary) + +instance Planable Issues where + type Id Issues = Git.CommitHash + type Proto Issues = Scramble + protoOf :: Proxy Issues -> Git.CommitHash -> IO Scramble + protoOf _ commitHash = do + filesChanged <- Git.getChangedFilesOf commitHash + issues <- concat <$> mapM (getIssuesOfFile commitHash) filesChanged + pure $ + Scramble + { issues = + M.unions + [ M.singleton issue.id issue | issue <- issues + ], + .. + } + + assume :: Scramble -> Issues + assume (Scramble {..}) = Issues {..} + + propagate :: [Id Issues] -> Issues -> Scramble -> Issues + propagate _ topIssues bottomScramble = + Issues + { commitHash = topIssues.commitHash, + issues = + M.mergeWithKey + ( \_ topIssue bottomIssue -> + Just $ + topIssue + { I.provenance = + I.Provenance + { first = bottomIssue.provenance.first, + last = + if ((/=) `on` (.rawTextHash)) topIssue bottomIssue + then topIssue.provenance.last + else bottomIssue.provenance.last + } + } + ) + ( \topIssues -> topIssues + ) + ( \bottomIssues -> + M.map + ( \bottomIssue -> + bottomIssue {I.closed = True} + ) + bottomIssues + ) + topIssues.issues + bottomScramble.issues + } diff --git a/app/History/Plan.hs b/app/History/Plan.hs new file mode 100644 index 0000000..3dec391 --- /dev/null +++ b/app/History/Plan.hs @@ -0,0 +1,207 @@ +module History.Plan + ( Planable (..), + formulate, + realise, + ) +where + +import Control.Concurrent (forkIO, killThread) +import Control.Concurrent.STM (TQueue, TVar, atomically, newTQueueIO, newTVarIO, readTQueue, readTVar, retry, writeTQueue, writeTVar) +import Control.Monad (forever) +import Data.Kind (Type) +import Data.List (intercalate) +import Data.List.NonEmpty qualified as NE +import Data.Map qualified as M +import Data.Proxy (Proxy (Proxy)) +import Data.Time.Clock +import Parallel (parMapM_) +import System.IO (hFlush, hPutStr, stderr, stdout) +import System.Time.Monotonic (Clock, clockGetTime, newClock) +import Text.Printf (printf) + +-- | Left-associative monadic fold of a structure with automatic parallelism, status-reporting and intermittent results (cacelability). +-- +-- Semantically, `realise (formulate xs)` is equivalent to +-- +-- ```haskell +-- \(x:xs) -> foldM propagate (assume <$> protoOf x) =<< mapM protoOf xs +-- ``` +-- +-- However, this module provides the following features: +-- +-- - `protoOf` and `propagate` are computed in parallel, +-- - `propagate` is scheduled to terminate as soon as possible, +-- +-- This makes it possible to scale the computation across multiple cores, but still interrupt the process for intermittent results. +-- +-- Additionally, a progress report is presented on stdout. +class Planable output where + type Id output :: Type + type Proto output :: Type + protoOf :: Proxy output -> Id output -> IO (Proto output) + assume :: Proto output -> output + propagate :: [Id output] -> output -> Proto output -> output + +data Plan output = Plan (NE.NonEmpty (Id output)) [Task output] + +data Task output + = Compute (Id output) + | Propagate (Maybe (Id output)) (Id output) + +isCompute, isPropagate :: Task output -> Bool +isCompute (Compute _) = True +isCompute _ = False +isPropagate (Propagate _ _) = True +isPropagate _ = False + +formulate :: + Planable output => + Proxy output -> + NE.NonEmpty (Id output) -> + Plan output +formulate _ ids@(NE.uncons -> (id, (maybe [] NE.toList -> rest))) = + Plan ids $ + Compute id + : Propagate Nothing id + : concat + ( zipWith + ( \id id' -> + [ Compute id, + Propagate id' id + ] + ) + rest + (map Just (id : rest)) + ) + +data State output = State + { tasks :: [Task output], + protos :: M.Map (Id output) (Proto output), + outputs :: M.Map (Id output) output, + elapsed :: Clock + } + +realise :: (Ord (Id output), Planable output) => Plan output -> IO output +realise plan@(Plan ids tasks) = do + elapsed <- newClock + let state0 = + State + { tasks = tasks, + protos = M.empty, + outputs = M.empty, + elapsed = elapsed + } + stateT <- newTVarIO state0 + statusT <- newTQueueIO + tid <- forkIO $ forever do + state <- atomically (readTQueue statusT) + hPutStr stderr . printStatus =<< status state0 state + hFlush stdout + parMapM_ (step Proxy plan statusT stateT) tasks + let id = NE.last ids + output <- atomically $ do + maybe retry pure . M.lookup id . (.outputs) =<< readTVar stateT + killThread tid + pure output + +step :: + (Ord (Id output), Planable output) => + Proxy output -> + Plan output -> + TQueue (State output) -> + TVar (State output) -> + Task output -> + IO () +step proxy _ statusT stateT (Compute id) = do + proto <- protoOf proxy id + atomically do + state <- readTVar stateT + let state' = state {protos = M.insert id proto state.protos} + writeTVar stateT state' + writeTQueue statusT state' + pure () +step _ (Plan ids _) statusT stateT (Propagate (Just id') id) = do + (output, proto) <- atomically do + state <- readTVar stateT + output <- maybe retry pure (M.lookup id' state.outputs) + proto <- maybe retry pure (M.lookup id state.protos) + pure (output, proto) + let output' = propagate (NE.toList ids) output proto + atomically do + state <- readTVar stateT + let state' = state {outputs = M.insert id output' state.outputs} + writeTVar stateT state' + writeTQueue statusT state' +step _ _ statusT stateT (Propagate Nothing id) = do + proto <- atomically do + state <- readTVar stateT + maybe retry pure (M.lookup id state.protos) + let output = assume proto + atomically do + state <- readTVar stateT + let state' = state {outputs = M.insert id output state.outputs} + writeTVar stateT state' + writeTQueue statusT state' + +data Status = Status + { numTasks :: Progress Int, + numProtos :: Progress Int, + numOutputs :: Progress Int, + elapsed :: DiffTime + } + deriving (Show, Eq) + +data Progress a = Progress + { total :: a, + completed :: a + } + deriving (Show, Eq) + +status :: State output -> State output -> IO Status +status state0 state = do + let totalTasks = length state0.tasks + totalProtos = length (filter isCompute state0.tasks) + totalOutputs = length (filter isPropagate state0.tasks) + completedTasks = completedProtos + completedOutputs + completedProtos = M.size state.protos + completedOutputs = M.size state.outputs + elapsed <- clockGetTime state.elapsed + pure + Status + { numTasks = Progress totalTasks completedTasks, + numProtos = Progress totalProtos completedProtos, + numOutputs = Progress totalOutputs completedOutputs, + elapsed = elapsed + } + +printStatus :: Status -> String +printStatus Status {..} = do + let formatProgress completed total = pad total completed <> "/" <> total + pad total completed = replicate (length total - length completed) ' ' <> completed + eta = + formatEta + ( (fromIntegral numTasks.total * (realToFrac elapsed)) + / fromIntegral numTasks.completed + ) + (<> "\r") . intercalate " " $ + [ formatProgress (show numTasks.completed) (show numTasks.total), + formatProgress (show numProtos.completed) (show numProtos.total), + formatProgress (show numOutputs.completed) (show numOutputs.total), + "ETA " <> eta + ] + +formatEta :: Double -> String +formatEta s = do + if s < 60 + then printf "%.1fs" s + else do + let m = s / 60 + if m < 60 + then printf "%.1fm" m + else do + let h = m / 60 + if h < 24 + then printf "%.1hf" h + else do + let d = h / 24 + printf "%.1fd" d diff --git a/app/History/Scramble.hs b/app/History/Scramble.hs new file mode 100644 index 0000000..9004dbf --- /dev/null +++ b/app/History/Scramble.hs @@ -0,0 +1,76 @@ +module History.Scramble + ( Scramble (..), + getIssuesOfFile, + fromComment, + ) +where + +import CMark qualified as D +import Comment qualified as G +import Control.Exception (Handler (..), catches) +import Data.Binary (Binary) +import Data.Digest.Pure.SHA qualified as S +import Data.Map qualified as M +import Data.Maybe (catMaybes) +import Data.Text qualified as T +import Data.Text.Lazy qualified as LT +import Data.Text.Lazy.Encoding qualified as LT +import Exception qualified as E +import GHC.Generics (Generic) +import Git qualified +import Issue qualified as I +import Issue.Parser qualified as I +import Issue.Tag qualified as I +import Issue.Text qualified as I +import Parallel (parMapM) +import Render qualified as P + +-- | `Scramble` records the complete issues ONLY in files that have +-- been changed in the commit. +data Scramble = Scramble + { commitHash :: Git.CommitHash, + filesChanged :: [FilePath], + issues :: M.Map T.Text I.Issue + } + deriving (Show, Binary, Generic) + +-- | Get all issues in the given directory and file. +getIssuesOfFile :: Git.CommitHash -> FilePath -> IO [I.Issue] +getIssuesOfFile commitHash filename = + ( fmap catMaybes . parMapM (fromComment commitHash) + =<< G.getComments commitHash filename + ) + `catches` [ Handler \(_ :: E.UnknownFile) -> pure [], + Handler \(_ :: E.UnsupportedLanguage) -> pure [] + ] + +-- | Note that `provenance` is trivial and needs to be fixed up later. +fromComment :: Git.CommitHash -> G.Comment -> IO (Maybe I.Issue) +fromComment commitHash comment = do + commit <- Git.getCommitOf commitHash + let provenance = I.Provenance commit commit + + pure $ + ( \parseResult -> + let (markers, title) = + I.stripIssueMarkers (T.pack (show (P.render parseResult.heading))) + in I.Issue + { commitHash = commitHash, + language = comment.language, + rawTextHash = S.sha1 (LT.encodeUtf8 (LT.fromStrict rawText)), + title = title, + file = comment.filePath, + provenance = provenance, + startByte = comment.startByte, + endByte = comment.endByte, + startPoint = comment.startPoint, + endPoint = comment.endPoint, + tags = I.extractTags parseResult.tags, + markers = markers, + commentStyle = commentStyle, + closed = False + } + ) + <$> I.parse I.issueMarkers (D.commonmarkToNode [] rawText) + where + (commentStyle, rawText) = G.uncomment comment.language comment.text |