-
-
Couldn't load subscription status.
- Fork 245
Migration from algolia to typesense #1055
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
9ac96d6
f7a9da4
c7d0fd1
e683961
ac51df1
188ab52
2c071c2
1d97ace
577faf8
85d6c2a
554ac8a
ed1db74
e040d41
69f323b
c04d682
1ff17f3
8e5ba72
a4b8a18
2b6c757
a69bc1b
3c52ba1
531124f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,30 @@ | ||
| """A command to load OWASP NEST Index.""" | ||
|
|
||
| import logging | ||
|
|
||
| from django.core.management.base import BaseCommand | ||
|
|
||
| from apps.common.typesense import REGISTERED_INDEXES | ||
|
|
||
| logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") | ||
|
|
||
|
|
||
class Command(BaseCommand):
    """Management command that creates every registered Typesense collection."""

    help = "Create Typesense indexes"

    def handle(self, *args, **kwargs):
        """Create a collection for each index in REGISTERED_INDEXES.

        A failure on one index is logged and does not stop the others.
        """
        logging.info("Starting Typesense index creation...")
        if not REGISTERED_INDEXES:
            logging.info("No registered indexes found.")
        else:
            logging.info("Registered indexes: %s", list(REGISTERED_INDEXES.keys()))
            for index in REGISTERED_INDEXES.values():
                # Single guarded call: the original invoked create_collection()
                # twice per index (once unguarded), duplicating work and
                # bypassing the error handling on the first call.
                try:
                    index.create_collection()
                except Exception:
                    logging.exception(
                        "Failed to create collection for index: %s", index.__class__.__name__
                    )
        logging.info("Typesense index creation complete!")
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,20 @@ | ||
| """A command to populate all Typesense indexes with database data.""" | ||
|
|
||
| import typesense | ||
| from django.core.management.base import BaseCommand | ||
|
|
||
| from apps.common.typesense import REGISTERED_INDEXES | ||
|
|
||
|
|
||
class Command(BaseCommand):
    """Management command that fills every Typesense collection from the database."""

    help = "Populate all Typesense indexes with database data"

    def handle(self, *args, **kwargs):
        """Populate each registered index, reporting per-index success or failure."""
        for name, index in REGISTERED_INDEXES.items():
            self.stdout.write(f"Populating '{name}'...")

            try:
                index.populate_collection()
            except typesense.exceptions.TypesenseClientError as e:
                self.stdout.write(self.style.ERROR(f"Failed to populate '{name}': {e}"))
            else:
                self.stdout.write(self.style.SUCCESS(f"Successfully populated '{name}'"))
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,113 @@ | ||
| """Typesense client configuration and schema definition.""" | ||
|
|
||
| import logging | ||
|
|
||
| import typesense | ||
| from django.apps import apps | ||
| from django.conf import settings | ||
|
|
||
| logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") | ||
|
|
||
| # Registry for indexes | ||
| REGISTERED_INDEXES = {} | ||
|
|
||
|
|
||
def register(model_name):
    """Class decorator that registers an index schema under *model_name*.

    The decorated class must define both 'index_name' and 'schema'.
    """

    def wrapper(cls):
        instance = cls()
        missing = not hasattr(instance, "index_name") or not hasattr(instance, "schema")
        if missing:
            message = f"{cls.__name__} must have 'index_name' and 'schema' attributes."
            raise AttributeError(message)
        REGISTERED_INDEXES[model_name] = instance
        logging.info("Registered index: %s", model_name)
        return cls

    return wrapper
|
|
||
|
|
||
class Typesense:
    """Typesense client manager."""

    @staticmethod
    def get_client():
        """Return an instance of Typesense client."""
        node = {
            "host": settings.TYPESENSE_HOST,
            "port": settings.TYPESENSE_PORT,
            "protocol": "http",
        }
        config = {
            "api_key": settings.TYPESENSE_API_KEY,
            "nodes": [node],
            "connection_timeout_seconds": 5,
        }
        return typesense.Client(config)
|
|
||
|
|
||
class IndexBase:
    """Base class for Typesense indexes.

    Subclasses must set `index_name` and `schema`, and implement
    `prepare_document()`.
    """

    # Typesense collection name; also used to look up the Django model.
    index_name = None
    # Typesense collection schema dict passed to collections.create().
    schema = {}

    def get_model(self):
        """Retrieve the Django model associated with the index name.

        Raises:
            ValueError: If no app provides a model matching `index_name`.
        """
        for app_config in apps.get_app_configs():
            try:
                # "user" is special-cased to the GitHub User model.
                if self.index_name == "user":
                    model = apps.get_model("github", "User")
                else:
                    model = app_config.get_model(self.index_name)

                if model:
                    return model

            except LookupError:
                continue
        message = f"No Django model found for index: {self.index_name}"
        raise ValueError(message)

    def create_collection(self):
        """Recreate the collection: drop any existing one, then create it."""
        client = Typesense.get_client()
        try:
            try:
                # Deleting a collection that does not exist raises; that is
                # expected on first run, so it is logged and ignored.
                client.collections[self.index_name].delete()
            except typesense.exceptions.TypesenseClientError as e:
                logging.info("%s", e)
            client.collections.create(self.schema)
            logging.info("Created collection: %s", self.index_name)
        except typesense.exceptions.TypesenseClientError:
            logging.exception("Error while creating collection %s", self.index_name)

    def populate_collection(self):
        """Populate Typesense collection with data from the database."""
        client = Typesense.get_client()
        model = self.get_model()
        queryset = model.objects.all().iterator()

        # Materialize to a list: the original used a generator, which is
        # always truthy, so its empty-data check could never fire.
        data = [self.prepare_document(obj) for obj in queryset if obj.is_indexable]

        if not data:
            # The original message also lacked its f-string prefix.
            logging.info("No data found for %s. Skipping.", self.index_name)
            return

        try:
            response = client.collections[self.index_name].documents.import_(
                data, {"action": "upsert"}
            )

            errors = [item["error"] for item in response if "error" in item]
            if errors:
                logging.info("Errors while populating '%s': %s", self.index_name, errors)
            logging.info("Populated '%s'", self.index_name)
        except typesense.exceptions.TypesenseClientError:
            logging.exception("Error while populating '%s'", self.index_name)

    def prepare_document(self, obj):
        """Convert model instance to a dictionary for Typesense."""
        message = "Subclasses must implement prepare_document()"
        raise NotImplementedError(message)
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -2,7 +2,9 @@ | |||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||||||||
| import re | ||||||||||||||||||||||||||||||||||||||||||||||||||||
| from datetime import datetime, timezone | ||||||||||||||||||||||||||||||||||||||||||||||||||||
| from functools import lru_cache | ||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||||||||
| import requests | ||||||||||||||||||||||||||||||||||||||||||||||||||||
| from django.conf import settings | ||||||||||||||||||||||||||||||||||||||||||||||||||||
| from django.template.defaultfilters import pluralize | ||||||||||||||||||||||||||||||||||||||||||||||||||||
| from django.utils.text import Truncator | ||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
@@ -15,6 +17,20 @@ def get_absolute_url(path): | |||||||||||||||||||||||||||||||||||||||||||||||||||
| return f"{settings.SITE_URL}/{path}" | ||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||||||||
@lru_cache(maxsize=1024)
def get_geolocation(ip_address):
    """Fetch latitude and longitude for an IP address via ipinfo.io.

    Returns:
        tuple: (lat, lng) as floats, or (None, None) when the location
        cannot be determined.
    """
    url = f"https://ipinfo.io/{ip_address}/json"
    try:
        response = requests.get(url, timeout=5)
        data = response.json()
        if "loc" in data:
            lat, lng = data["loc"].split(",")
            return float(lat), float(lng)
    except (requests.RequestException, KeyError, ValueError):
        # Network failures (timeout, connection error), malformed JSON,
        # and unparsable "loc" values all degrade to "location unknown".
        return None, None
    # "loc" absent from a successful response: the original fell through
    # here returning bare None, which broke tuple unpacking in callers.
    return None, None
|
|
||||||||||||||||||||||||||||||||||||||||||||||||||||
|
Comment on lines
+20
to
+32
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 💡 Verification agent 🧩 Analysis chainConsider adding API key for ipinfo.io service. The The function is appropriately cached using Consider using an API key and adding it to your environment variables: - url = f"https://ipinfo.io/{ip_address}/json"
+ api_key = settings.IPINFO_API_KEY
+ url = f"https://ipinfo.io/{ip_address}/json?token={api_key}"🌐 Web query: 💡 Result: Yes, ipinfo.io requires an API token for production use. Here’s a detailed breakdown: Authentication Requirements
Usage Guidelines
Integration Examples
For production-grade applications requiring reliability and scalability, upgrading to a paid plan is advised to avoid throttling and access advanced features[8][16]. Citations:
Production API Authentication Needed for ipinfo.io The current implementation of Recommended changes:
- url = f"https://ipinfo.io/{ip_address}/json"
+ api_key = settings.IPINFO_API_KEY
+ url = f"https://ipinfo.io/{ip_address}/json?token={api_key}"The use of 📝 Committable suggestion
Suggested change
|
||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||||||||
def get_nest_user_agent():
    """Return the Nest user agent (lowercased app name, spaces as dashes)."""
    return settings.APP_NAME.lower().replace(" ", "-")
|
|
||||||||||||||||||||||||||||||||||||||||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,90 @@ | ||
| """Typesense search API endpoint.""" | ||
|
|
||
| import json | ||
|
|
||
| import typesense | ||
| from django.core.cache import cache | ||
| from django.http import JsonResponse | ||
| from django.views.decorators.http import require_POST | ||
|
|
||
| from apps.common.typesense import Typesense | ||
| from apps.common.utils import get_geolocation, get_user_ip_address | ||
| from apps.core.utils.params_mapping_typesense import get_typesense_params_for_index | ||
|
|
||
| CACHE_PREFIX = "typesense_proxy" | ||
| CACHE_TTL_IN_SECONDS = 3600 # 1 hour | ||
|
|
||
|
|
||
def get_typesense_search_results(
    index_name, query, page, hits_per_page, sort_by=None, ip_address=None
):
    """Return search results for the given parameters and index."""
    params = get_typesense_params_for_index(index_name)

    params["q"] = query
    params["page"] = page
    params["per_page"] = hits_per_page

    if sort_by:
        params["sort_by"] = sort_by

    # Chapters are ordered by distance from the requesting user when the
    # client IP can be geolocated.
    if index_name == "chapter" and ip_address:
        user_lat, user_lng = get_geolocation(ip_address)
        if user_lat and user_lng:
            params["sort_by"] = f"_geoloc({user_lat},{user_lng}):asc,updated_at:desc"

    client = Typesense.get_client()
    search_result = client.collections[index_name].documents.search(params)

    found = search_result.get("found", 0)
    return {
        "hits": [hit["document"] for hit in search_result.get("hits", [])],
        "nbPages": (found + hits_per_page - 1) // hits_per_page,
        "totalHits": found,
    }
|
|
||
|
|
||
@require_POST
def typesense_search(request):
    """Perform a generic Typesense search API endpoint.

    Expects a JSON body with "indexName" and optional "query", "page",
    "hitsPerPage" and "sortBy". Results are cached for one hour.
    """
    try:
        data = json.loads(request.body)

        index_name = data.get("indexName")
        hits_per_page = min(int(data.get("hitsPerPage", 25)), 250)
        page = int(data.get("page", 1))
        query = data.get("query", "")
        sort_by = data.get("sortBy", "")

        ip_address = get_user_ip_address(request=request)

        cache_key = f"{CACHE_PREFIX}:{index_name}:{query}:{page}:{hits_per_page}:{sort_by}"
        # Chapter results are geo-sorted per user, so their cache entry must
        # include the IP. The original tested `"chapters" in index_name`,
        # which never matches the actual index name "chapter", letting one
        # user's geo-sorted results be served to everyone from cache.
        if index_name == "chapter":
            cache_key = f"{cache_key}:{ip_address}"

        result = cache.get(cache_key)
        if result is not None:
            return JsonResponse(result)

        result = get_typesense_search_results(
            index_name,
            query,
            page,
            hits_per_page,
            sort_by=sort_by,
            ip_address=ip_address,
        )

        cache.set(cache_key, result, CACHE_TTL_IN_SECONDS)

        return JsonResponse(result)

    # ValueError/TypeError cover non-numeric "page"/"hitsPerPage" payloads,
    # which previously escaped as unhandled 500s.
    except (json.JSONDecodeError, TypeError, ValueError, typesense.exceptions.TypesenseClientError):
        return JsonResponse(
            {"error": "An internal error occurred. Please try again later."},
            status=500,
        )
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,68 @@ | ||||||||||||||||||||||||||||||||||||||
| """Typesense search parameters based on the index name.""" | ||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||
def get_typesense_params_for_index(index_name):
    """Return Typesense search parameters based on the index name."""
    # Attributes shared by every index.
    params = {
        "num_typos": 2,
        "prioritize_exact_match": True,
        "highlight_full_fields": "",  # Equivalent to attributesToHighlight: []
        "drop_tokens_threshold": 1,  # Similar to removeWordsIfNoResults: allOptional
    }

    # Per-index overrides: searchable fields (query_by), their weights,
    # retrieved fields (include_fields) and default ordering (sort_by).
    overrides = {
        "issue": {
            "query_by": "title,project_name,repository_name,labels,repository_languages,project_description,repository_description,project_tags,repository_topics,author_login,author_name,summary,project_level",
            "query_by_weights": "7,7,7,6,6,5,5,4,4,3,3,2,1",
            "include_fields": "comments_count,created_at,hint,labels,project_name,project_url,repository_languages,summary,title,updated_at,url",
            "sort_by": "created_at:desc,comments_count:desc,repository_stars_count:desc",
        },
        "chapter": {
            "query_by": "name,leaders,top_contributors.login,top_contributors.name,suggested_location,country,region,postal_code,tags",
            "query_by_weights": "10,5,4,4,2,2,2,2,1",
            "include_fields": "_geoloc,created_at,is_active,key,leaders,name,region,related_urls,suggested_location,summary,tags,top_contributors,updated_at,url",
            "sort_by": "created_at:asc,updated_at:desc",
        },
        "project": {
            "query_by": "name,repositories.description,repositories.name,custom_tags,languages,tags,topics,description,companies,organizations,leaders,top_contributors.login,top_contributors.name,level",
            "query_by_weights": "8,7,7,6,6,6,6,5,4,4,3,2,2,1",
            "include_fields": "contributors_count,forks_count,is_active,issues_count,key,languages,leaders,level,name,organizations,repositories_count,stars_count,summary,top_contributors,topics,type,updated_at,url",
            "sort_by": "level_raw:desc,stars_count:desc,updated_at:desc",
        },
        "committee": {
            "query_by": "name,leaders,top_contributors.login,top_contributors.name,tags",
            "query_by_weights": "4,3,2,2,1",
            "include_fields": "created_at,key,leaders,name,related_urls,summary,top_contributors,updated_at,url",
            "sort_by": "name:asc,created_at:asc,updated_at:desc",
        },
        "user": {
            "query_by": "email,login,name,company,location,bio",
            "query_by_weights": "3,3,3,2,2,1",
            "include_fields": "avatar_url,bio,company,created_at,email,followers_count,following_count,key,location,login,name,public_repositories_count,title,updated_at,url",
            "sort_by": "max_contributions_count:desc,created_at:desc,followers_count:desc",
        },
    }

    # Unknown indexes fall back to searching every field.
    params.update(overrides.get(index_name, {"query_by": "_all"}))

    return params
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fix string formatting in logging statement.
The logging statement uses curly braces for a variable, but it's not formatted as an f-string.
📝 Committable suggestion