4 changes: 4 additions & 0 deletions .env.example
@@ -26,3 +26,7 @@ SLACK_CHANNEL=
 WRGL_CLIENT_ID=
 WRGL_CLIENT_SECRET=
 IPNO_API_KEY=
+
+# OpenAI settings
+OPENAI_API_KEY=
+OPENAI_MISCONDUCT_CONFIDENCE_THRESHOLD=0.7
2 changes: 1 addition & 1 deletion ipno/config/settings/base.py
@@ -303,4 +303,4 @@

 CSV_DATA_PATH = "./ipno/csv_data"
 
-IPNO_API_KEY = env.str("IPNO_API_KEY")
+IPNO_API_KEY = env.str("IPNO_API_KEY", None)

🛠️ Refactor suggestion

Add OpenAI configuration settings.

Based on the PR objectives to integrate OpenAI's API with batch mode processing, consider adding the following settings:

 IPNO_API_KEY = env.str("IPNO_API_KEY", None)
+
+# OpenAI Configuration
+OPENAI_API_KEY = env.str("OPENAI_API_KEY", None)
+OPENAI_BATCH_MODE = env.bool("OPENAI_BATCH_MODE", False)  # 50% discount with 24h turnaround
+OPENAI_MODEL_NAME = env.str("OPENAI_MODEL_NAME", "gpt-4")  # or your preferred model
+OPENAI_MAX_TOKENS = env.int("OPENAI_MAX_TOKENS", 4096)
+OPENAI_TEMPERATURE = env.float("OPENAI_TEMPERATURE", 0.0)  # 0.0 for deterministic outputs
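
One gap worth flagging alongside this sketch: .env.example also introduces OPENAI_MISCONDUCT_CONFIDENCE_THRESHOLD, and the new service reads settings.OPENAI_MISCONDUCT_CONFIDENCE_THRESHOLD, so that setting needs a declaration here as well. A minimal addition, assuming the same env helper and mirroring the 0.7 default from .env.example:

OPENAI_MISCONDUCT_CONFIDENCE_THRESHOLD = env.float(
    "OPENAI_MISCONDUCT_CONFIDENCE_THRESHOLD", 0.7  # default mirrors .env.example
)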

17 changes: 17 additions & 0 deletions ipno/news_articles/management/commands/process_llm_batches.py
@@ -0,0 +1,17 @@
from django.core.management import BaseCommand
from news_articles.services.process_llm_analysis import ProcessLLMAnalysis


class Command(BaseCommand):
    help = "Process completed LLM analysis batches"

    def handle(self, *args, **options):
        processor = ProcessLLMAnalysis()

        self.stdout.write("Starting to process completed batches...")
        batches = processor.client.batches.list(limit=100)
        completed_batches = [b for b in batches.data if b.status == "completed" and b.output_file_id]

        self.stdout.write(f"Found {len(completed_batches)} completed batches to process")
        processor.process_completed_batches()
        self.stdout.write("Finished processing completed batches")
Comment on lines +8 to +17

⚠️ Potential issue

Add error handling and improve batch processing.

The current implementation has several areas that need attention:

  1. Missing error handling for API calls
  2. The filtered batches are not used in process_completed_batches()
  3. Hard-coded batch limit without pagination
  4. No progress tracking for large batch counts

Consider applying these improvements:

-    def handle(self, *args, **options):
-        processor = ProcessLLMAnalysis()
-        
-        self.stdout.write("Starting to process completed batches...")
-        batches = processor.client.batches.list(limit=100)
-        completed_batches = [b for b in batches.data if b.status == "completed" and b.output_file_id]
-        
-        self.stdout.write(f"Found {len(completed_batches)} completed batches to process")
-        processor.process_completed_batches()
-        self.stdout.write("Finished processing completed batches")
+    def handle(self, *args, **options):
+        try:
+            processor = ProcessLLMAnalysis()
+            self.stdout.write("Starting to process completed batches...")
+            
+            batch_limit = 100  # Consider making this configurable
+            processed_count = 0
+            
+            # Implement pagination to handle more than 100 batches
+            batches = processor.client.batches.list(limit=batch_limit)
+            while batches.data:
+                completed_batches = [
+                    b for b in batches.data 
+                    if b.status == "completed" and b.output_file_id
+                ]
+                
+                if completed_batches:
+                    self.stdout.write(
+                        f"Processing {len(completed_batches)} completed batches..."
+                    )
+                    processor.process_completed_batches(completed_batches)
+                    processed_count += len(completed_batches)
+                
+                # Get next page if available
+                if batches.has_more:
+                    batches = processor.client.batches.list(
+                        limit=batch_limit,
+                        after=batches.last_id
+                    )
+                else:
+                    break
+            
+            self.stdout.write(
+                self.style.SUCCESS(
+                    f"Successfully processed {processed_count} batches"
+                )
+            )
+            
+        except Exception as e:
+            self.stdout.write(
+                self.style.ERROR(f"Error processing batches: {str(e)}")
+            )
+            raise

Committable suggestion was skipped due to low confidence.
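
Note that the diff above passes the filtered list into process_completed_batches, which currently takes no arguments. A minimal sketch of the corresponding service-side change (the parameter name is illustrative):

    def process_completed_batches(self, completed_batches=None):
        """Process results from completed batches, fetching them if not supplied."""
        if completed_batches is None:
            completed_batches = [
                b for b in self.client.batches.list(limit=100).data
                if b.status == "completed" and b.output_file_id
            ]
        for batch in completed_batches:
            ...  # existing per-batch result handling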

11 changes: 11 additions & 0 deletions ipno/news_articles/management/commands/run_llm_analysis.py
@@ -0,0 +1,11 @@
from django.core.management import BaseCommand

from news_articles.services.process_llm_analysis import ProcessLLMAnalysis


class Command(BaseCommand):
    help = "Process unanalyzed news articles with OpenAI LLM to detect police misconduct content"

    def handle(self, *args, **options):
        processor = ProcessLLMAnalysis()
        processor.process_unanalyzed_articles()
Comment on lines +6 to +11

🛠️ Refactor suggestion

Enhance command robustness and monitoring capabilities.

While the implementation is functional, consider adding the following improvements for production readiness:

  1. Error handling for API failures and rate limits
  2. Progress reporting for long-running operations
  3. Logging for execution tracking
  4. Batch size control
  5. Dry-run option for testing

Here's a suggested implementation with these improvements:

 class Command(BaseCommand):
     help = "Process unanalyzed news articles with OpenAI LLM to detect police misconduct content"
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            '--batch-size',
+            type=int,
+            default=100,
+            help='Number of articles to process in each batch'
+        )
+        parser.add_argument(
+            '--dry-run',
+            action='store_true',
+            help='Run without making actual API calls'
+        )

     def handle(self, *args, **options):
+        try:
+            self.stdout.write(
+                self.style.SUCCESS('Starting LLM analysis of unanalyzed articles...')
+            )
+            
             processor = ProcessLLMAnalysis()
-            processor.process_unanalyzed_articles()
+            processor.process_unanalyzed_articles(
+                batch_size=options['batch_size'],
+                dry_run=options['dry_run'],
+                progress_callback=lambda count: self.stdout.write(
+                    f'Processed {count} articles...'
+                )
+            )
+            
+            self.stdout.write(
+                self.style.SUCCESS('Successfully completed LLM analysis')
+            )
+        except Exception as e:
+            self.stdout.write(
+                self.style.ERROR(f'Error processing articles: {str(e)}')
+            )
+            raise

Committable suggestion was skipped due to low confidence.

13 changes: 13 additions & 0 deletions ipno/news_articles/management/commands/submit_llm_analysis.py
@@ -0,0 +1,13 @@
from django.core.management import BaseCommand
from news_articles.services.process_llm_analysis import ProcessLLMAnalysis


class Command(BaseCommand):
    help = "Submit unprocessed news articles for batch LLM analysis"
Comment on lines +1 to +6

🛠️ Refactor suggestion

Add type hints and enhance command documentation.

Consider improving the command's maintainability and usability:

+from logging import getLogger
 from django.core.management import BaseCommand
-from news_articles.services.process_llm_analysis import ProcessLLMAnalysis
+from news_articles.services.process_llm_analysis import ProcessLLMAnalysis, BatchSubmission
+
+logger = getLogger(__name__)

 class Command(BaseCommand):
-    help = "Submit unprocessed news articles for batch LLM analysis"
+    help = """
+    Submit unprocessed news articles for batch LLM analysis using OpenAI's API.
+    
+    This command:
+    1. Identifies unprocessed news articles in the database
+    2. Submits them to OpenAI's batch processing queue
+    3. Returns batch IDs for tracking
+    
+    Note: Processing may take up to 24 hours in batch mode.
+    """


    def handle(self, *args, **options):
        processor = ProcessLLMAnalysis()
        submitted_batches = processor.submit_unanalyzed_articles()
        self.stdout.write(f"Submitted {len(submitted_batches)} articles for processing")
        for batch_id, article_id in submitted_batches:
            self.stdout.write(f"Article {article_id} submitted as batch {batch_id}")
Comment on lines +8 to +13

⚠️ Potential issue

Enhance error handling and add production monitoring.

The current implementation lacks error handling, progress tracking, and production monitoring capabilities.

Consider applying these improvements:

-    def handle(self, *args, **options):
-        processor = ProcessLLMAnalysis()
-        submitted_batches = processor.submit_unanalyzed_articles()
-        self.stdout.write(f"Submitted {len(submitted_batches)} articles for processing")
-        for batch_id, article_id in submitted_batches:
-            self.stdout.write(f"Article {article_id} submitted as batch {batch_id}")
+    def handle(self, *args, **options) -> None:
+        try:
+            processor = ProcessLLMAnalysis()
+            
+            # Get count of unprocessed articles
+            total_articles = processor.get_unprocessed_count()
+            if total_articles == 0:
+                self.stdout.write("No unprocessed articles found.")
+                return
+                
+            self.stdout.write(f"Found {total_articles} unprocessed articles")
+            
+            # Process articles with progress tracking
+            submitted_batches: list[BatchSubmission] = []
+            with self.stdout.progress_bar(total_articles) as progress:
+                submitted_batches = processor.submit_unanalyzed_articles(
+                    progress_callback=lambda: progress.update(1)
+                )
+            
+            # Log results
+            logger.info(
+                "Batch submission completed",
+                extra={
+                    "total_submitted": len(submitted_batches),
+                    "batch_ids": [batch.id for batch in submitted_batches]
+                }
+            )
+            
+            # Output results
+            self.stdout.write(
+                self.style.SUCCESS(f"Successfully submitted {len(submitted_batches)} articles")
+            )
+            for batch in submitted_batches:
+                self.stdout.write(f"Article {batch.article_id} submitted as batch {batch.id}")
+                
+        except Exception as e:
+            logger.error("Failed to submit articles for analysis", exc_info=e)
+            self.stdout.write(
+                self.style.ERROR(f"Failed to submit articles: {str(e)}")
+            )
+            raise

This implementation:

  1. Adds proper error handling and logging
  2. Shows progress for large datasets
  3. Includes type hints
  4. Provides better user feedback
  5. Follows Django's management command best practices

Committable suggestion was skipped due to low confidence.

5 changes: 4 additions & 1 deletion ipno/news_articles/models/news_article.py
@@ -11,9 +11,12 @@ class NewsArticle(TimeStampsModel):
     published_date = models.DateField()
     author = models.CharField(max_length=255, blank=True, null=True)
     url = models.CharField(max_length=255, blank=True, null=True)
-    is_hidden = models.BooleanField(default=False)
+    is_hidden = models.BooleanField(default=True)  # Default to hidden until LLM processed
     hide_reason = models.CharField(max_length=50, null=True, blank=True)
 
     is_processed = models.BooleanField(default=False)
+    is_llm_processed = models.BooleanField(default=False)
+    llm_analysis_result = models.JSONField(null=True, blank=True)
Comment on lines +18 to +19

⚠️ Potential issue

Add schema validation for LLM analysis results.

The JSONField needs schema validation to ensure consistent structure of LLM outputs. Consider using a custom field validator or Pydantic model.

Here's a suggested implementation:

from django.core.exceptions import ValidationError
from pydantic import BaseModel, Field

class LLMAnalysisSchema(BaseModel):
    is_relevant: bool = Field(..., description="Whether the article is relevant to police misconduct")
    confidence_score: float = Field(..., ge=0, le=1, description="Confidence score of the analysis")
    categories: list[str] = Field(default_factory=list, description="Categories of misconduct identified")
    summary: str = Field(..., max_length=500, description="Brief summary of relevant content")

def validate_llm_result(value):
    if value is not None:
        try:
            LLMAnalysisSchema.model_validate(value)
        except Exception as e:
            raise ValidationError(f"Invalid LLM analysis result format: {str(e)}")

class NewsArticle(TimeStampsModel):
    # ... other fields ...
    llm_analysis_result = models.JSONField(
        null=True,
        blank=True,
        validators=[validate_llm_result]
    )

This ensures:

  1. Type safety for LLM outputs
  2. Consistent structure across all articles
  3. Validation of confidence scores and other fields

     source = models.ForeignKey(
         "news_articles.NewsArticleSource",
         null=True,
177 changes: 177 additions & 0 deletions ipno/news_articles/services/process_llm_analysis.py
@@ -0,0 +1,177 @@
import json
from typing import List

⚠️ Potential issue

Remove unused import List

The List import from the typing module is not used in this file. Removing unused imports helps keep the code clean and improves readability.

Apply this diff to remove the unused import:

-from typing import List

from django.conf import settings
from django.utils import timezone
from pydantic import BaseModel, Field
from openai import OpenAI
from news_articles.models import NewsArticle


class MisconductAnalysis(BaseModel):
    contains_misconduct: bool = Field(description="Indicates if police misconduct is mentioned in the article")
    confidence_score: float = Field(ge=0, le=1, description="Confidence score between 0 and 1")
    explanation: str = Field(description="Brief explanation of the decision")

    @classmethod
    def model_json_schema(cls, *args, **kwargs) -> dict:
        """Print and return the JSON schema that will be used by OpenAI."""
        schema = super().model_json_schema(*args, **kwargs)
        print("\nGenerated OpenAI Schema:")
        print("------------------------")
        print("type:", schema.get("type"))
        print("properties:", schema.get("properties"))
        print("required:", schema.get("required"))
        print("additionalProperties:", schema.get("additionalProperties", False))
        print("------------------------\n")
        return schema


class ProcessLLMAnalysis:
    def __init__(self):
        self.client = OpenAI(api_key=settings.OPENAI_API_KEY)

    def _create_single_request(self, article: NewsArticle) -> str:
        """Create a JSONL file for a single article batch request."""
        request = {
            "custom_id": str(article.id),
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-4o-mini",
                "messages": [
                    {"role": "system", "content": "You are a helpful assistant analyzing news articles for police misconduct content."},
                    {"role": "user", "content": f"Analyze the following news article text and determine if it contains any reference to police misconduct.\n\nArticle text:\n{article.content}"}
                ],
                "temperature": 0,
                "response_format": {
                    "type": "json_schema",
                    "json_schema": MisconductAnalysis.model_json_schema()
                }
            }
        }

        filename = f"batch_request_{article.id}_{timezone.now().strftime('%Y%m%d_%H%M%S')}.jsonl"
        with open(filename, 'w') as f:
            f.write(json.dumps(request))

        return filename

    def _process_batch_results(self, batch_id: str):
        """Process results from a completed batch."""
        batch = self.client.batches.retrieve(batch_id)

        if batch.status == "completed" and batch.output_file_id:
            # Download and process results
            output = self.client.files.retrieve_content(batch.output_file_id)

            for line in output.splitlines():
                result = json.loads(line)
                article_id = result['custom_id']
                response = result['response']

                try:
                    article = NewsArticle.objects.get(id=article_id)

                    if response.get('error'):
                        print(f"Error processing article {article_id}: {response['error']}")
                        article.hide_reason = "LLM processing error"
                        article.save()
                        continue

                    analysis_result = response['body']['choices'][0]['message']['content']
                    article.llm_analysis_result = analysis_result
                    article.is_llm_processed = True

                    # Unhide article if misconduct confidence is above threshold
                    if analysis_result['confidence_score'] >= settings.OPENAI_MISCONDUCT_CONFIDENCE_THRESHOLD:
                        article.is_hidden = False
                    else:
                        article.hide_reason = "Below confidence threshold"

                    article.save()
Comment on lines +81 to +91

⚠️ Potential issue

Parse the analysis result before accessing its fields

The content field retrieved from response['body']['choices'][0]['message']['content'] is likely a JSON-formatted string. To access its fields such as 'confidence_score', you need to parse it into a Python dictionary or use the MisconductAnalysis model to parse it.

Apply this diff to parse the content using the MisconductAnalysis model:

                     analysis_result = response['body']['choices'][0]['message']['content']
+                    analysis_data = MisconductAnalysis.parse_raw(analysis_result)
                     article.llm_analysis_result = analysis_result
                     article.is_llm_processed = True

                     # Unhide article if misconduct confidence is above threshold
-                    if analysis_result['confidence_score'] >= settings.OPENAI_MISCONDUCT_CONFIDENCE_THRESHOLD:
+                    if analysis_data.confidence_score >= settings.OPENAI_MISCONDUCT_CONFIDENCE_THRESHOLD:
                         article.is_hidden = False
                     else:
                         article.hide_reason = "Below confidence threshold"

                except NewsArticle.DoesNotExist:
                    print(f"Article {article_id} not found")
                except Exception as e:
                    print(f"Error processing result for article {article_id}: {str(e)}")

Comment on lines +59 to +97

🛠️ Refactor suggestion

Refactor duplicate code in result processing methods

The methods _process_batch_results and process_completed_batches contain similar logic for processing batch results. Consider refactoring the common code into a separate method or utility function to enhance maintainability and reduce code duplication.

Also applies to: 137-175
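
A minimal sketch of such a helper, assuming the result-line shape already used in both methods (the name _apply_analysis_result is illustrative):

    def _apply_analysis_result(self, result: dict):
        """Apply one parsed batch-output line to its NewsArticle."""
        article_id = result['custom_id']
        response = result['response']
        try:
            article = NewsArticle.objects.get(id=article_id)

            if response.get('error'):
                print(f"Error processing article {article_id}: {response['error']}")
                article.hide_reason = "LLM processing error"
                article.save()
                return

            analysis_result = response['body']['choices'][0]['message']['content']
            analysis_data = MisconductAnalysis.parse_raw(analysis_result)
            article.llm_analysis_result = analysis_result
            article.is_llm_processed = True

            # Unhide article if misconduct confidence is above threshold
            if analysis_data.confidence_score >= settings.OPENAI_MISCONDUCT_CONFIDENCE_THRESHOLD:
                article.is_hidden = False
            else:
                article.hide_reason = "Below confidence threshold"

            article.save()
        except NewsArticle.DoesNotExist:
            print(f"Article {article_id} not found")
        except Exception as e:
            print(f"Error processing result for article {article_id}: {str(e)}")

Both callers would then reduce to splitting the output into lines, json.loads-ing each one, and delegating to this helper.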

    def _cleanup_file(self, filename: str):
        """Clean up temporary files."""
        import os

🛠️ Refactor suggestion

Move import os to the top of the file

It's a best practice to place all import statements at the beginning of the file. This improves code organization and readability.

Apply this diff to move the import:

+import os

And remove import os from line 100.

Committable suggestion was skipped due to low confidence.

        try:
            os.remove(filename)
        except Exception as e:
            print(f"Error cleaning up file {filename}: {str(e)}")

    def submit_unanalyzed_articles(self):
        """Submit each unanalyzed article as a single-item batch."""
        articles = NewsArticle.objects.filter(is_llm_processed=False)
        submitted_batch_ids = []

        for article in articles:
            try:
                # Create and upload batch file for single article
                batch_file = self._create_single_request(article)
                file = self.client.files.create(
                    file=open(batch_file, "rb"),
                    purpose="batch"
                )
Comment on lines +114 to +118

⚠️ Potential issue

Use a context manager when opening files

When opening files, it's recommended to use a context manager (with statement) to ensure the file is properly closed after its use, even if an error occurs.

Apply this diff to use a context manager:

                 batch_file = self._create_single_request(article)
-                file = self.client.files.create(
-                    file=open(batch_file, "rb"),
-                    purpose="batch"
-                )
+                with open(batch_file, "rb") as f:
+                    file = self.client.files.create(
+                        file=f,
+                        purpose="batch"
+                    )

                # Create batch processing job
                batch = self.client.batches.create(
                    input_file_id=file.id,
                    endpoint="/v1/chat/completions",
                    completion_window="24h"
                )

                submitted_batch_ids.append((batch.id, article.id))
                self._cleanup_file(batch_file)

            except Exception as e:
                print(f"Error submitting article {article.id}: {str(e)}")
                if 'batch_file' in locals():
                    self._cleanup_file(batch_file)

        return submitted_batch_ids

    def process_completed_batches(self):
        """Process results from completed single-item batches."""
        batches = self.client.batches.list(limit=100)

        for batch in batches.data:
            if batch.status == "completed" and batch.output_file_id:
                try:
                    output = self.client.files.retrieve_content(batch.output_file_id)
                    result = json.loads(output)  # Single item, no need to iterate

⚠️ Potential issue

Ensure correct parsing of batch output

In process_completed_batches, the batch output is assumed to be a single JSON object, but the output may contain multiple JSON lines similar to _process_batch_results. Please verify the output format and adjust the parsing accordingly.

Consider updating the code to process each line individually:

-                result = json.loads(output)  # Single item, no need to iterate
+                for line in output.splitlines():
+                    result = json.loads(line)

Committable suggestion was skipped due to low confidence.
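
For reference, each line of a batch output file is a standalone JSON object of roughly this shape (illustrative, abbreviated values):

{"id": "batch_req_abc123", "custom_id": "42", "response": {"status_code": 200, "body": {"choices": [{"message": {"content": "{\"contains_misconduct\": true, \"confidence_score\": 0.92, \"explanation\": \"...\"}"}}]}}, "error": null}

Even a single-request batch produces JSONL, so splitting on lines is the safer parse here as well.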


                    article_id = result['custom_id']
                    response = result['response']

                    try:
                        article = NewsArticle.objects.get(id=article_id)

                        if response.get('error'):
                            print(f"Error processing article {article_id}: {response['error']}")
                            article.hide_reason = "LLM processing error"
                            article.save()
                            continue

                        analysis_result = response['body']['choices'][0]['message']['content']
                        article.llm_analysis_result = analysis_result
                        article.is_llm_processed = True

                        # Unhide article if misconduct confidence is above threshold
                        if analysis_result['confidence_score'] >= settings.OPENAI_MISCONDUCT_CONFIDENCE_THRESHOLD:
                            article.is_hidden = False
                        else:
                            article.hide_reason = "Below confidence threshold"

                        article.save()
Comment on lines +159 to +169

⚠️ Potential issue

Parse the analysis result before accessing its fields

As with the previous method, parse the content field to access the analysis data properly.

Apply this diff to parse the content:

                         analysis_result = response['body']['choices'][0]['message']['content']
+                        analysis_data = MisconductAnalysis.parse_raw(analysis_result)
                         article.llm_analysis_result = analysis_result
                         article.is_llm_processed = True

                         # Unhide article if misconduct confidence is above threshold
-                        if analysis_result['confidence_score'] >= settings.OPENAI_MISCONDUCT_CONFIDENCE_THRESHOLD:
+                        if analysis_data.confidence_score >= settings.OPENAI_MISCONDUCT_CONFIDENCE_THRESHOLD:
                             article.is_hidden = False
                         else:
                             article.hide_reason = "Below confidence threshold"

                    except NewsArticle.DoesNotExist:
                        print(f"Article {article_id} not found")
                    except Exception as e:
                        print(f"Error processing result for article {article_id}: {str(e)}")

                except Exception as e:
                    print(f"Error processing batch {batch.id}: {str(e)}")
16 changes: 15 additions & 1 deletion ipno/tasks/constants.py
@@ -1,6 +1,10 @@
HOURLY_TASK = "hourly_task"
DAILY_TASK = "daily_task"

TASK_TYPES = ((DAILY_TASK, "Daily task"),)
TASK_TYPES = (
(HOURLY_TASK, "Hourly task"),
(DAILY_TASK, "Daily task"),
)

 APP_TASKS = [
     {
@@ -13,6 +17,16 @@
"command": "run_news_articles_crawlers",
"task_type": DAILY_TASK,
},
{
"task_name": "Submit news articles for LLM analysis",
"command": "submit_llm_analysis",
"task_type": HOURLY_TASK, # Submit new articles frequently
},
{
"task_name": "Process completed LLM analysis batches",
"command": "process_llm_batches",
"task_type": DAILY_TASK, # Process results once per day
},
{
"task_name": "Run news articles' officers matching",
"command": "run_news_articles_officers_matching",
4 changes: 3 additions & 1 deletion requirements/base.txt
@@ -40,4 +40,6 @@ celery[redis]==5.2.7
 django-celery-results==2.4.0
 cryptography==38.0.4
 django-db-geventpool==4.0.1
-slack_sdk==3.19.5
+slack_sdk==3.19.5
+openai==1.3.0
+pydantic>=2.0.0

⚠️ Potential issue

Pin pydantic to a specific version for stability.

Using >= could lead to unexpected behavior if a new version introduces breaking changes. Since Pydantic 2.x has significant changes from 1.x, it's recommended to pin to a specific version.

Apply this change:

-pydantic>=2.0.0
+pydantic==2.5.2