From 0d96613d9aa41f93ebb440bb1aa383456b49f28f Mon Sep 17 00:00:00 2001 From: Alexander Foremny Date: Fri, 15 Dec 2023 07:40:33 +0100 Subject: feat: drop haskell-tree-sitter Drop haskell-tree-sitter in favor of custom bindings to tree-sitter. haskell-tree-sitter is outdated and seems unmaintained. The implementation add low-level bindings to tree-sitter and traverses the AST in Haskell. We suspect that many FFI calls are more expensive than performing just a single API call to a C function that does the traversal. This will be addressed in upcoming commits. @prerequisite-for add-languages-elm-shell-nix --- app/Comment.hs | 80 ++++++++++++++++++++++++++++++------------------- app/Comment/Language.hs | 3 +- app/TreeSitter.hs | 65 ++++++++++++++++++++++++++++++++++++++++ app/TreeSitter/bridge.c | 34 +++++++++++++++++++++ 4 files changed, 149 insertions(+), 33 deletions(-) create mode 100644 app/TreeSitter.hs create mode 100644 app/TreeSitter/bridge.c (limited to 'app') diff --git a/app/Comment.hs b/app/Comment.hs index 7bd4ad5..63f610a 100644 --- a/app/Comment.hs +++ b/app/Comment.hs @@ -25,16 +25,13 @@ import Data.Text.Encoding qualified as T import Exception qualified as E import Foreign.C.String import Foreign.Marshal.Alloc (free, malloc) -import Foreign.Marshal.Array (mallocArray, peekArray) -import Foreign.Ptr (nullPtr) +import Foreign.Ptr (Ptr, nullPtr) import Foreign.Storable import GHC.Generics (Generic) import GHC.Int (Int64) import Git qualified import System.FilePath (takeExtension) -import TreeSitter.Node qualified as S -import TreeSitter.Parser qualified as S -import TreeSitter.Tree qualified as S +import TreeSitter qualified as S data Comment = Comment { text :: T.Text, @@ -96,51 +93,72 @@ extractComments :: FilePath -> Language -> B.ByteString -> IO [Comment] extractComments filePath language str' = do S.withParser (parser language) $ \parser -> do B.useAsCStringLen str' $ \(str, len) -> do - tree <- S.ts_parser_parse_string parser nullPtr str len - S.withRootNode tree $ \node -> do - map - ( \n' -> - let startByte = fromIntegral $ S.nodeStartByte n' - endByte = fromIntegral $ S.nodeEndByte n' - text = + tree <- S.ts_parser_parse_string parser nullPtr str (fromIntegral len) + node <- malloc + S.ts_tree_root_node tree node + x <- + mapM + ( \n' -> do + startByte <- fromIntegral <$> S.ts_node_start_byte n' + endByte <- fromIntegral <$> S.ts_node_end_byte n' + let text = T.decodeUtf8 . B.take (fromIntegral endByte - fromIntegral startByte) . B.drop (fromIntegral startByte) $ str' - startPoint = fromTSPoint (S.nodeStartPoint n') - endPoint = fromTSPoint (S.nodeEndPoint n') - fromTSPoint (S.TSPoint {..}) = Point (fromIntegral pointRow + 1) (fromIntegral pointColumn + 1) - in Comment {..} + startPoint <- do + point <- malloc + S.ts_node_start_point n' point + S.Point {..} <- peek point + free point + pure + Point + { row = fromIntegral row, + column = fromIntegral column + } + endPoint <- do + point <- malloc + S.ts_node_end_point n' point + S.Point {..} <- peek point + free point + pure + Point + { row = fromIntegral row, + column = fromIntegral column + } + + pure Comment {..} ) - <$> (commentsFromNodeRec language =<< peek node) + =<< (commentsFromNodeRec language node) + free node + pure x -commentsFromNodeRec :: Language -> S.Node -> IO [S.Node] +commentsFromNodeRec :: Language -> Ptr S.Node -> IO [Ptr S.Node] commentsFromNodeRec language = (filterM (isCommentNode language) =<<) . childNodesFromNodeRec -isCommentNode :: Language -> S.Node -> IO Bool +isCommentNode :: Language -> Ptr S.Node -> IO Bool isCommentNode language n = - (`elem` (nodeTypes language)) <$> peekCString (S.nodeType n) + fmap (`elem` (nodeTypes language)) . peekCString =<< S.ts_node_type n -childNodesFromNodeRec :: S.Node -> IO [S.Node] +childNodesFromNodeRec :: Ptr S.Node -> IO [Ptr S.Node] childNodesFromNodeRec n = do ns' <- childNodesFromNode n ns <- concat <$> mapM childNodesFromNodeRec ns' pure $ n : ns -childNodesFromNode :: S.Node -> IO [S.Node] +childNodesFromNode :: Ptr S.Node -> IO [Ptr S.Node] childNodesFromNode n = do - let numChildren = fromIntegral (S.nodeChildCount n) - ns <- mallocArray numChildren - tsNode <- malloc - poke tsNode (S.nodeTSNode n) - S.ts_node_copy_child_nodes tsNode ns - free tsNode - ns' <- peekArray numChildren ns - free ns - pure ns' + numChildren <- fromIntegral <$> S.ts_node_named_child_count n + mapM + ( \k -> do + node <- malloc + S.ts_node_named_child n k node + pure node + ) + [0 .. numChildren - 1] data CommentStyle = LineStyle T.Text diff --git a/app/Comment/Language.hs b/app/Comment/Language.hs index 67c3413..925cf56 100644 --- a/app/Comment/Language.hs +++ b/app/Comment/Language.hs @@ -14,8 +14,7 @@ import Data.Text qualified as T import Exception qualified as E import Foreign.Ptr (Ptr) import GHC.Generics (Generic) -import TreeSitter.Haskell qualified as S -import TreeSitter.Language qualified as S +import TreeSitter qualified as S data Language = Haskell diff --git a/app/TreeSitter.hs b/app/TreeSitter.hs new file mode 100644 index 0000000..e911d1b --- /dev/null +++ b/app/TreeSitter.hs @@ -0,0 +1,65 @@ +module TreeSitter where + +-- | References: [tree-sitter/api.h](https://github.com/tree-sitter/tree-sitter/blob/master/lib/include/tree_sitter/api.h) + +import Foreign.C.String (CString) +import Foreign.C.Types (CInt (..)) +import Foreign.Ptr (Ptr) +import Foreign.Storable (Storable (..), peek) + +data Parser + +data Language + +data Tree = Tree + +data Node = Node + +instance Storable Node where + sizeOf _ = 32 + +data Point = Point + { row :: CInt, + column :: CInt + } deriving (Show) + +instance Storable Point where + sizeOf _ = 8 + alignment _ = 8 + peek p = Point <$> peekByteOff p 0 <*> peekByteOff p 4 + +withParser :: Ptr Language -> (Ptr Parser -> IO a) -> IO a +withParser l f = do + p <- ts_parser_new + ts_parser_set_language p l + x <- f p + ts_parser_delete p + pure x + +foreign import ccall unsafe "ts_node_start_point_p" ts_node_start_point :: Ptr Node -> Ptr Point -> IO () + +foreign import ccall unsafe "ts_node_end_point_p" ts_node_end_point :: Ptr Node -> Ptr Point -> IO () + +foreign import ccall unsafe "ts_node_start_byte_p" ts_node_start_byte :: Ptr Node -> IO CInt + +foreign import ccall unsafe "ts_node_end_byte_p" ts_node_end_byte :: Ptr Node -> IO CInt + +foreign import ccall unsafe "ts_node_type_p" ts_node_type :: Ptr Node -> IO CString + +foreign import ccall unsafe "ts_node_named_child_p" ts_node_named_child :: Ptr Node -> CInt -> Ptr Node -> IO () + +foreign import ccall unsafe "ts_node_named_child_count_p" ts_node_named_child_count :: Ptr Node -> IO CInt + +foreign import ccall unsafe "ts_tree_root_node_p" ts_tree_root_node :: Ptr Tree -> Ptr Node -> IO () + +foreign import ccall unsafe "ts_tree_delete" ts_tree_delete :: Ptr Tree -> IO () + +foreign import ccall unsafe "ts_parser_parse_string" ts_parser_parse_string :: Ptr Parser -> Ptr Tree -> CString -> CInt -> IO (Ptr Tree) + +foreign import ccall unsafe "ts_parser_new" ts_parser_new :: IO (Ptr Parser) + +foreign import ccall unsafe "ts_parser_delete" ts_parser_delete :: Ptr Parser -> IO () + +foreign import ccall unsafe "ts_parser_set_language" ts_parser_set_language :: Ptr Parser -> Ptr Language -> IO () + +foreign import ccall unsafe "tree_sitter_haskell" tree_sitter_haskell :: Ptr Language diff --git a/app/TreeSitter/bridge.c b/app/TreeSitter/bridge.c new file mode 100644 index 0000000..904c88e --- /dev/null +++ b/app/TreeSitter/bridge.c @@ -0,0 +1,34 @@ +#include "tree_sitter/api.h" +#include "string.h" + +void ts_tree_root_node_p(TSTree *tree, TSNode *node) { + (*node) = ts_tree_root_node(tree); +} + +uint32_t ts_node_named_child_count_p(TSNode *node) { + return ts_node_named_child_count(*node); +} + +uint32_t ts_node_start_byte_p(TSNode *node) { + return ts_node_start_byte(*node); +} + +uint32_t ts_node_end_byte_p(TSNode *node) { + return ts_node_end_byte(*node); +} + +uint32_t ts_node_start_point_p(TSNode *node, TSPoint *point) { + (*point) = ts_node_start_point(*node); +} + +uint32_t ts_node_end_point_p(TSNode *node, TSPoint *point) { + (*point) = ts_node_end_point(*node); +} + +const char* ts_node_type_p(TSNode *node) { + return ts_node_type(*node); +} + +void ts_node_named_child_p(TSNode* self, uint32_t child_index, TSNode* node) { + (*node) = ts_node_named_child(*self, child_index); +} -- cgit v1.2.3