Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhanced java parsing support #210

Merged
merged 6 commits into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions app/modules/code_provider/github/github_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,18 @@ async def get_user_repos(
):
user_repo_list = await GithubController(db).get_user_repos(user=user)
user_repo_list["repositories"].extend(config_provider.get_demo_repo_list())

# Remove duplicates while preserving order
seen = set()
deduped_repos = []
for repo in reversed(user_repo_list["repositories"]):
# Create tuple of values to use as hash key
repo_key = repo["full_name"]


if repo_key not in seen:
seen.add(repo_key)
deduped_repos.append(repo)

user_repo_list["repositories"] = deduped_repos
return user_repo_list

Expand Down
54 changes: 35 additions & 19 deletions app/modules/parsing/graph_construction/code_graph_service.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import hashlib
import logging
import time
from typing import Dict, Optional

from neo4j import GraphDatabase
Expand Down Expand Up @@ -43,10 +44,8 @@ def create_and_store_graph(self, repo_dir, project_id, user_id):
nx_graph = self.repo_map.create_graph(repo_dir)

with self.driver.session() as session:
# Create nodes
import time

start_time = time.time() # Start timing
start_time = time.time()
node_count = nx_graph.number_of_nodes()
logging.info(f"Creating {node_count} nodes")

Expand All @@ -55,25 +54,42 @@ def create_and_store_graph(self, repo_dir, project_id, user_id):
for i in range(0, node_count, batch_size):
batch_nodes = list(nx_graph.nodes(data=True))[i : i + batch_size]
nodes_to_create = []
for node in batch_nodes:
node_type = node[1].get("type")
label = node_type.capitalize() if node_type else "UNKNOWN"
node_data = {
"name": node[0],
"file_path": node[1].get("file", ""),
"start_line": node[1].get("line", -1),
"end_line": node[1].get("end_line", -1),

for node_id, node_data in batch_nodes:
# Get the node type and ensure it's one of our expected types
node_type = node_data.get("type", "UNKNOWN")
if node_type == "UNKNOWN":
continue
# Initialize labels with NODE
labels = ["NODE"]

# Add specific type label if it's a valid type
if node_type in ["FILE", "CLASS", "FUNCTION", "INTERFACE"]:
labels.append(node_type)

# Prepare node data
processed_node = {
"name": node_data.get(
"name", node_id
), # Use node_id as fallback
"file_path": node_data.get("file", ""),
"start_line": node_data.get("line", -1),
"end_line": node_data.get("end_line", -1),
"repoId": project_id,
"node_id": CodeGraphService.generate_node_id(node[0], user_id),
"node_id": CodeGraphService.generate_node_id(node_id, user_id),
"entityId": user_id,
"type": node_type if node_type else "Unknown",
"text": node[1].get("text", ""),
"labels": ["NODE", label],
"type": node_type,
"text": node_data.get("text", ""),
"labels": labels,
}

# Remove None values
processed_node = {
k: v for k, v in processed_node.items() if v is not None
}
# Remove any null values from node_data
node_data = {k: v for k, v in node_data.items() if v is not None}
nodes_to_create.append(node_data)
nodes_to_create.append(processed_node)

# Create nodes with labels
session.run(
"""
UNWIND $nodes AS node
Expand Down Expand Up @@ -112,7 +128,7 @@ def create_and_store_graph(self, repo_dir, project_id, user_id):
edges=edges_to_create,
)

end_time = time.time() # End timing
end_time = time.time()
logging.info(
f"Time taken to create graph and search index: {end_time - start_time:.2f} seconds"
)
Expand Down
27 changes: 17 additions & 10 deletions app/modules/parsing/graph_construction/parsing_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ async def parse_directory(
project = await project_manager.get_project_from_db(
repo_name, repo_details.branch_name, user_id
)

# First check if this is a demo project that hasn't been accessed by this user yet
if not project and repo_details.repo_name in demo_repos:
existing_project = await project_manager.get_global_project_from_db(
Expand Down Expand Up @@ -134,24 +134,28 @@ async def parse_directory(
project_manager,
db,
)

# Handle existing projects (including previously duplicated demo projects)
if project:
project_id = project.id
is_latest = await parse_helper.check_commit_status(project_id)

if not is_latest or project.status != ProjectStatusEnum.READY.value:
cleanup_graph = True
logger.info(f"Submitting parsing task for existing project {project_id}")
logger.info(
f"Submitting parsing task for existing project {project_id}"
)
process_parsing.delay(
repo_details.model_dump(),
user_id,
user_email,
project_id,
cleanup_graph,
)

await project_manager.update_project_status(project_id, ProjectStatusEnum.SUBMITTED)

await project_manager.update_project_status(
project_id, ProjectStatusEnum.SUBMITTED
)
PostHogClient().send_event(
user_id,
"parsed_repo_event",
Expand All @@ -161,8 +165,11 @@ async def parse_directory(
"project_id": project_id,
},
)
return {"project_id": project_id, "status": ProjectStatusEnum.SUBMITTED.value}

return {
"project_id": project_id,
"status": ProjectStatusEnum.SUBMITTED.value,
}

return {"project_id": project_id, "status": project.status}
else:
# Handle new non-demo projects
Expand Down Expand Up @@ -195,9 +202,9 @@ async def handle_new_project(
}

logger.info(f"Submitting parsing task for new project {new_project_id}")

repo_name = repo_details.repo_name or repo_details.repo_path.split("/")[-1]
await project_manager.register_project(
repo_details.repo_name,
repo_name,
repo_details.branch_name,
user_id,
new_project_id,
Expand Down
Loading
Loading