3 changes: 3 additions & 0 deletions backend/.env.example
@@ -1,6 +1,9 @@
DJANGO_ALGOLIA_APPLICATION_ID=None
DJANGO_ALGOLIA_EXCLUDED_LOCAL_INDEX_NAMES=None
DJANGO_ALGOLIA_WRITE_API_KEY=None
DJANGO_TYPESENSE_API_KEY=nest_typesense_dev
DJANGO_TYPESENSE_HOST=nest-typesense
DJANGO_TYPESENSE_PORT=8108
DJANGO_ALLOWED_HOSTS=*
DJANGO_AWS_ACCESS_KEY_ID=None
DJANGO_AWS_SECRET_ACCESS_KEY=None
9 changes: 9 additions & 0 deletions backend/Makefile
@@ -25,6 +25,15 @@ enrich-data: \
	owasp-enrich-events \
	owasp-enrich-projects

load-index:
	@echo "Indexing Nest data"
	@CMD="poetry run python manage.py load_index" $(MAKE) exec-backend-command

populate-index:
	@echo "Populating Nest data"
	@CMD="python manage.py populate_data" $(MAKE) exec-backend-command


generate-sitemap:
	@CMD="python manage.py generate_sitemap" $(MAKE) exec-backend-command

30 changes: 30 additions & 0 deletions backend/apps/common/management/commands/load_index.py
@@ -0,0 +1,30 @@
"""A command to load OWASP NEST Index."""

import logging

from django.core.management.base import BaseCommand

from apps.common.typesense import REGISTERED_INDEXES

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")


class Command(BaseCommand):
    help = "Create Typesense indexes"

    def handle(self, *args, **kwargs):
        logging.info("Starting Typesense index creation...")
        if not REGISTERED_INDEXES:
            logging.info("No registered indexes found.")
        else:
            logging.info("Registered indexes: %s", list(REGISTERED_INDEXES.keys()))
            for index in REGISTERED_INDEXES.values():
                try:
                    index.create_collection()
                except Exception:
                    logging.exception(
                        "Failed to create collection for index: %s", index.__class__.__name__
                    )

        logging.info("Typesense index creation complete!")
20 changes: 20 additions & 0 deletions backend/apps/common/management/commands/populate_data.py
@@ -0,0 +1,20 @@
"""A command to populate all Typesense indexes with database data."""

import typesense
from django.core.management.base import BaseCommand

from apps.common.typesense import REGISTERED_INDEXES


class Command(BaseCommand):
    help = "Populate all Typesense indexes with database data"

    def handle(self, *args, **kwargs):
        for index_name, index_instance in REGISTERED_INDEXES.items():
            self.stdout.write(f"Populating '{index_name}'...")

            try:
                index_instance.populate_collection()
                self.stdout.write(self.style.SUCCESS(f"Successfully populated '{index_name}'"))
            except typesense.exceptions.TypesenseClientError as e:
                self.stdout.write(self.style.ERROR(f"Failed to populate '{index_name}': {e}"))
113 changes: 113 additions & 0 deletions backend/apps/common/typesense.py
@@ -0,0 +1,113 @@
"""Typesense client configuration and schema definition."""

import logging

import typesense
from django.apps import apps
from django.conf import settings

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Registry for indexes
REGISTERED_INDEXES = {}


def register(model_name):
    """Register a model schema."""

    def wrapper(cls):
        instance = cls()
        if not hasattr(instance, "index_name") or not hasattr(instance, "schema"):
            message = f"{cls.__name__} must have 'index_name' and 'schema' attributes."
            raise AttributeError(message)
        REGISTERED_INDEXES[model_name] = instance
        logging.info("Registered index: %s", model_name)
        return cls

    return wrapper


class Typesense:
    """Typesense client manager."""

    @staticmethod
    def get_client():
        """Return an instance of Typesense client."""
        return typesense.Client(
            {
                "api_key": settings.TYPESENSE_API_KEY,
                "nodes": [
                    {
                        "host": settings.TYPESENSE_HOST,
                        "port": settings.TYPESENSE_PORT,
                        "protocol": "http",
                    }
                ],
                "connection_timeout_seconds": 5,
            }
        )


class IndexBase:
    """Base class for Typesense indexes."""

    index_name = None
    schema = {}

    def get_model(self):
        """Retrieve the Django model associated with the index name."""
        for app_config in apps.get_app_configs():
            try:
                if self.index_name == "user":
                    model = apps.get_model("github", "User")
                else:
                    model = app_config.get_model(self.index_name)

                if model:
                    return model

            except LookupError:
                continue
        raise ValueError(self.index_name)

    def create_collection(self):
        """Recreate the collection, dropping any existing one first."""
        client = Typesense.get_client()
        try:
            try:
                client.collections[self.index_name].delete()
            except typesense.exceptions.TypesenseClientError as e:
                logging.info("%s", e)
            client.collections.create(self.schema)
            logging.info("Created collection: %s", self.index_name)
        except typesense.exceptions.TypesenseClientError:
            logging.exception("Error while creating collection %s", self.index_name)

    def populate_collection(self):
        """Populate Typesense collection with data from the database."""
        client = Typesense.get_client()
        model = self.get_model()
        queryset = model.objects.filter().iterator()

        data = (self.prepare_document(obj) for obj in queryset if obj.is_indexable)

        if not data:
            logging.info("No data found for {self.index_name}. Skipping... .")
            return
Comment on lines +95 to +96

⚠️ Potential issue

Fix string formatting in logging statement.

The logging statement uses curly braces for a variable, but it's not formatted as an f-string.

-            logging.info("No data found for {self.index_name}. Skipping... .")
+            logging.info(f"No data found for {self.index_name}. Skipping...")

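A lazy %-style call would fix this equally well and match the logging style used elsewhere in this module (e.g. logging.info("Registered index: %s", model_name)) — a minimal alternative sketch:

# Alternative: lazy %-formatting, consistent with the module's other logging calls
logging.info("No data found for %s. Skipping...", self.index_name)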
Comment on lines +92 to +96

⚠️ Potential issue

Fix logic issue with generator evaluation.

The current code creates a generator with data = (self.prepare_document(obj) for obj in queryset if obj.is_indexable) but then immediately checks if not data:. This condition will always be false because generators are always truthy, even when empty. This means the "No data found" message will never be logged.

-        data = (self.prepare_document(obj) for obj in queryset if obj.is_indexable)
-
-        if not data:
-            logging.info("No data found for {self.index_name}. Skipping... .")
-            return
+        # Check if there's data before creating the generator
+        first_obj = next(queryset, None)
+        if not first_obj or not first_obj.is_indexable:
+            logging.info(f"No data found for {self.index_name}. Skipping...")
+            return
+            
+        # Process the first object and create generator for the rest
+        data = [self.prepare_document(first_obj)]
+        data.extend(self.prepare_document(obj) for obj in queryset if obj.is_indexable)

Committable suggestion skipped: line range outside the PR's diff.

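To illustrate the pitfall — a generator object is truthy even when it would yield nothing, so the emptiness guard can never fire:

# A generator is truthy regardless of whether it yields anything
empty = (x for x in [])
print(bool(empty))  # True — so `if not data:` never triggers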

        try:
            response = client.collections[self.index_name].documents.import_(
                data, {"action": "upsert"}
            )

            errors = [item["error"] for item in response if "error" in item]
            if errors:
                logging.info("Errors while populating '%s': %s", self.index_name, errors)
            logging.info("Populated '%s'", self.index_name)
        except typesense.exceptions.TypesenseClientError:
            logging.exception("Error while populating '%s'", self.index_name)

    def prepare_document(self, obj):
        """Convert model instance to a dictionary for Typesense."""
        message = "Subclasses must implement prepare_document()"
        raise NotImplementedError(message)
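For context, a concrete index built on these pieces might look like the sketch below; ChapterIndex, its schema fields, and the prepare_document body are illustrative assumptions, not code from this PR:

from apps.common.typesense import IndexBase, register


@register("chapter")  # hypothetical registration, mirroring the decorator above
class ChapterIndex(IndexBase):
    index_name = "chapter"
    schema = {
        "name": "chapter",
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "_geoloc", "type": "geopoint", "optional": True},
            {"name": "updated_at", "type": "float"},
        ],
    }

    def prepare_document(self, chapter):
        # Must return a dict whose keys match the schema above
        return {
            "name": chapter.name,
            "_geoloc": [chapter.latitude, chapter.longitude],
            "updated_at": chapter.updated_at.timestamp(),
        }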
16 changes: 16 additions & 0 deletions backend/apps/common/utils.py
@@ -2,7 +2,9 @@

import re
from datetime import datetime, timezone
from functools import lru_cache

import requests
from django.conf import settings
from django.template.defaultfilters import pluralize
from django.utils.text import Truncator
@@ -15,6 +17,20 @@ def get_absolute_url(path):
    return f"{settings.SITE_URL}/{path}"


@lru_cache(maxsize=1024)
def get_geolocation(ip_address):
    """Fetch latitude and longitude from an IP address."""
    url = f"https://ipinfo.io/{ip_address}/json"
    try:
        response = requests.get(url, timeout=5)
        data = response.json()
        if "loc" in data:
            lat, lng = data["loc"].split(",")
            return float(lat), float(lng)
    except (KeyError, ValueError):
        return None, None

Comment on lines +20 to +32

💡 Verification agent

🧩 Analysis chain

Consider adding API key for ipinfo.io service.

The get_geolocation function appears to work without API key authentication for ipinfo.io. Be aware that free tier usage of ipinfo.io has rate limits, which could lead to blocking or degraded performance in production.

The function is appropriately cached using lru_cache with a reasonable size, and has good error handling, returning (None, None) when the API call fails.

Consider using an API key and adding it to your environment variables:

- url = f"https://ipinfo.io/{ip_address}/json"
+ api_key = settings.IPINFO_API_KEY
+ url = f"https://ipinfo.io/{ip_address}/json?token={api_key}"

🌐 Web query:

Does ipinfo.io require an API key or token for production use?

💡 Result:

Yes, ipinfo.io requires an API token for production use. Here’s a detailed breakdown:

Authentication Requirements

  • API Token is mandatory: All API requests require a token for authentication, which can be provided via HTTP Basic Auth, Bearer token headers, or URL parameters[1][2][13].
  • Free tier limitations: The free plan includes 50,000 monthly requests and lacks advanced features like company or carrier data[6][13]. Paid plans (Basic, Standard, Business, Enterprise) unlock higher limits and additional data fields[6][13].

Usage Guidelines

  • Token security: Tokens should not be exposed on the frontend due to security risks. Backend integration is recommended for production use[3][7].
  • Rate limits: Unauthenticated requests are severely limited, while authenticated requests respect the plan’s quota (e.g., 50,000/month for free tiers)[6][9].

Integration Examples

  • Frontend: Use HTTPS with tokens in URL parameters for limited client-side requests (e.g., fetch("https://ipinfo.io/json?token=$TOKEN"))[2][4].
  • Backend: Official libraries (e.g., Go, Python) simplify authenticated requests and bulk lookups[5][10].

For production-grade applications requiring reliability and scalability, upgrading to a paid plan is advised to avoid throttling and access advanced features[8][16].



Production API Authentication Needed for ipinfo.io

The current implementation of get_geolocation works functionally, but since ipinfo.io requires an API token for production use, you should update the URL construction to include the token. This will help prevent hitting free tier rate limits and ensure stable operation in a production environment.

Recommended changes:

  • Retrieve your API token from environment variables (e.g., via settings.IPINFO_API_KEY).
  • Modify the URL to pass the token as a query parameter, for example:
- url = f"https://ipinfo.io/{ip_address}/json"
+ api_key = settings.IPINFO_API_KEY
+ url = f"https://ipinfo.io/{ip_address}/json?token={api_key}"

The use of lru_cache and error handling remain solid. Please adjust the implementation accordingly.



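If the token route is taken, the wiring might look like this sketch — the IPINFO_API_KEY name follows the suggestion above, and the DJANGO_-prefixed environment variable is an assumption modeled on backend/.env.example:

# settings.py (hypothetical wiring)
import os

IPINFO_API_KEY = os.environ.get("DJANGO_IPINFO_API_KEY", "")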

def get_nest_user_agent():
"""Return Nest user agent."""
return settings.APP_NAME.replace(" ", "-").lower()
90 changes: 90 additions & 0 deletions backend/apps/core/api/typesense.py
@@ -0,0 +1,90 @@
"""Typesense search API endpoint."""

import json

import typesense
from django.core.cache import cache
from django.http import JsonResponse
from django.views.decorators.http import require_POST

from apps.common.typesense import Typesense
from apps.common.utils import get_geolocation, get_user_ip_address
from apps.core.utils.params_mapping_typesense import get_typesense_params_for_index

CACHE_PREFIX = "typesense_proxy"
CACHE_TTL_IN_SECONDS = 3600 # 1 hour


def get_typesense_search_results(
    index_name, query, page, hits_per_page, sort_by=None, ip_address=None
):
    """Return search results for the given parameters and index."""
    search_parameters = get_typesense_params_for_index(index_name)

    search_parameters.update(
        {
            "q": query,
            "page": page,
            "per_page": hits_per_page,
        }
    )

    if sort_by:
        search_parameters["sort_by"] = sort_by

    if index_name == "chapter" and ip_address:
        user_lat, user_lng = get_geolocation(ip_address)
        if user_lat and user_lng:
            search_parameters["sort_by"] = f"_geoloc({user_lat},{user_lng}):asc,updated_at:desc"

    client = Typesense.get_client()
    search_result = client.collections[index_name].documents.search(search_parameters)
    documents = [doc["document"] for doc in search_result.get("hits", [])]

    return {
        "hits": documents,
        "nbPages": (search_result.get("found", 0) + hits_per_page - 1) // hits_per_page,
        "totalHits": search_result.get("found", 0),
    }


@require_POST
def typesense_search(request):
    """Perform a generic Typesense search API endpoint."""
    try:
        data = json.loads(request.body)

        index_name = data.get("indexName")
        hits_per_page = min(int(data.get("hitsPerPage", 25)), 250)
        page = int(data.get("page", 1))
        query = data.get("query", "")
        sort_by = data.get("sortBy", "")

        ip_address = get_user_ip_address(request=request)

        cache_key = f"{CACHE_PREFIX}:{index_name}:{query}:{page}:{hits_per_page}:{sort_by}"
        if index_name == "chapter":
            # Geo-sorted chapter results vary by client IP, so key the cache on it.
            cache_key = f"{cache_key}:{ip_address}"

        result = cache.get(cache_key)
        if result is not None:
            return JsonResponse(result)

        result = get_typesense_search_results(
            index_name,
            query,
            page,
            hits_per_page,
            sort_by=sort_by,
            ip_address=ip_address,
        )

        cache.set(cache_key, result, CACHE_TTL_IN_SECONDS)

        return JsonResponse(result)

    except (json.JSONDecodeError, typesense.exceptions.TypesenseClientError):
        return JsonResponse(
            {"error": "An internal error occurred. Please try again later."},
            status=500,
        )
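A request against this endpoint might look like the sketch below; the /idx/search/ path is an assumption, since the PR's URL configuration isn't part of this diff:

import requests

# Hypothetical route — the real path depends on the project's urls.py
response = requests.post(
    "http://localhost:8000/idx/search/",
    json={"indexName": "project", "query": "security", "page": 1, "hitsPerPage": 25},
    timeout=5,
)
print(response.json()["totalHits"])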
68 changes: 68 additions & 0 deletions backend/apps/core/utils/params_mapping_typesense.py
@@ -0,0 +1,68 @@
"""Typesense search parameters based on the index name."""


def get_typesense_params_for_index(index_name):
"""Return Typesense search parameters based on the index name."""
# Common attributes
params = {
"num_typos": 2,
"prioritize_exact_match": True,
"highlight_full_fields": "", # Equivalent to attributesToHighlight: []
"drop_tokens_threshold": 1, # Similar to removeWordsIfNoResults: allOptional
}

match index_name:
case "issue":
# searchable attributes
params["query_by"] = (
"title,project_name,repository_name,labels,repository_languages,project_description,repository_description,project_tags,repository_topics,author_login,author_name,summary,project_level"
)
# weights of searchable attributes
params["query_by_weights"] = "7,7,7,6,6,5,5,4,4,3,3,2,1"
# equivalent to attributesToRetrieve
params["include_fields"] = (
"comments_count,created_at,hint,labels,project_name,project_url,repository_languages,summary,title,updated_at,url"
)
# sort by fields
params["sort_by"] = "created_at:desc,comments_count:desc,repository_stars_count:desc"

case "chapter":
params["query_by"] = (
"name,leaders,top_contributors.login,top_contributors.name,suggested_location,country,region,postal_code,tags"
)
params["query_by_weights"] = "10,5,4,4,2,2,2,2,1"
params["include_fields"] = (
"_geoloc,created_at,is_active,key,leaders,name,region,related_urls,suggested_location,summary,tags,top_contributors,updated_at,url"
)
params["sort_by"] = "created_at:asc,updated_at:desc"

case "project":
params["query_by"] = (
"name,repositories.description,repositories.name,custom_tags,languages,tags,topics,description,companies,organizations,leaders,top_contributors.login,top_contributors.name,level"
)
params["query_by_weights"] = "8,7,7,6,6,6,6,5,4,4,3,2,2,1"
params["include_fields"] = (
"contributors_count,forks_count,is_active,issues_count,key,languages,leaders,level,name,organizations,repositories_count,stars_count,summary,top_contributors,topics,type,updated_at,url"
)
params["sort_by"] = "level_raw:desc,stars_count:desc,updated_at:desc"

Comment on lines +39 to +48

💡 Verification agent

🧩 Analysis chain

Ensure project sorting logic is consistent with requirements.

The project index sorting is configured as:

params["sort_by"] = "level_raw:desc,stars_count:desc,updated_at:desc"

Verify this matches the expected sorting behavior for projects in the UI. Also, note that "level_raw" is used for sorting while "level" is used in query_by - ensure this is intentional.


🏁 Script executed:

#!/bin/bash
# Check for references to level_raw field in the project schema

echo "Checking for level_raw in project schema:"
fd "schema.*level_raw" --exec grep -A 10 -B 10 "level_raw" {}

Length of output: 156


Action Required: Update Sorting Field in Project Query

The executed script confirms that there is no reference to the level_raw field in the project schema. Since the query_by parameter uses "level", using "level_raw" in the sort_by parameter is likely inconsistent with the intended functionality. Please verify whether this discrepancy was intentional. If not, update the sorting field to match the schema (e.g., change level_raw:desc to level:desc), or ensure that the project schema is updated to include the level_raw field.

  • Location: backend/apps/core/utils/params_mapping_typesense.py (around line 44)
  • Diff Suggestion:
    - params["sort_by"] = "level_raw:desc,stars_count:desc,updated_at:desc"
    + params["sort_by"] = "level:desc,stars_count:desc,updated_at:desc"

case "committee":
params["query_by"] = "name,leaders,top_contributors.login,top_contributors.name,tags"
params["query_by_weights"] = "4,3,2,2,1"
params["include_fields"] = (
"created_at,key,leaders,name,related_urls,summary,top_contributors,updated_at,url"
)
params["sort_by"] = "name:asc,created_at:asc,updated_at:desc"

case "user":
params["query_by"] = "email,login,name,company,location,bio"
params["query_by_weights"] = "3,3,3,2,2,1"
params["include_fields"] = (
"avatar_url,bio,company,created_at,email,followers_count,following_count,key,location,login,name,public_repositories_count,title,updated_at,url"
)
params["sort_by"] = "max_contributions_count:desc,created_at:desc,followers_count:desc"

case _:
params["query_by"] = "_all"

return params
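Tying it together, the search endpoint above consumes these params roughly as follows — a minimal sketch using only the client and helper defined in this PR:

from apps.common.typesense import Typesense
from apps.core.utils.params_mapping_typesense import get_typesense_params_for_index

# Start from the per-index defaults, then layer the request on top
params = get_typesense_params_for_index("committee")
params.update({"q": "owasp", "page": 1, "per_page": 10})

client = Typesense.get_client()
result = client.collections["committee"].documents.search(params)
print(result.get("found", 0), "committees matched")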