Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 1 addition & 90 deletions learning_resources/etl/canvas.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,11 @@
import base64
import logging
import zipfile
from collections.abc import Generator
from datetime import datetime
from io import BytesIO
from pathlib import Path
from tempfile import TemporaryDirectory

import pypdfium2 as pdfium
from django.conf import settings
from litellm import completion
from PIL import Image

from learning_resources.constants import (
TUTOR_PROBLEM_TYPE,
Expand Down Expand Up @@ -213,6 +208,7 @@ def transform_canvas_problem_files(
run,
overwrite=overwrite,
valid_file_types=VALID_TUTOR_PROBLEM_FILE_TYPES,
is_tutor_problem_file_import=True,
):
keys_to_keep = [
"run",
Expand Down Expand Up @@ -249,89 +245,4 @@ def transform_canvas_problem_files(
problem_file_data["content"]
)

if (
problem_file_data["file_extension"].lower() == ".pdf"
and settings.CANVAS_PDF_TRANSCRIPTION_MODEL
and not run.problem_files.filter(
checksum=problem_file_data["checksum"]
).exists()
):
markdown_content = _pdf_to_markdown(
Path(olx_path) / Path(problem_file_data["source_path"])
)
if markdown_content:
problem_file_data["content"] = markdown_content
yield problem_file_data


def pdf_to_base64_images(pdf_path, fmt="JPEG", max_size=2000, quality=85):
    """
    Convert a PDF file to base64 encoded images, yielding one per page.
    Resizes images to reduce file size while keeping good OCR quality.

    This is a generator: pages are rendered lazily as the caller iterates.
    The document is closed when iteration finishes, when the generator is
    closed early, or when rendering/encoding raises.

    Args:
        pdf_path (str): Path to the PDF file
        fmt (str): Output format ('JPEG' or 'PNG') (default: 'JPEG')
        max_size (int): Maximum width/height in pixels (default: 2000)
        quality (int): JPEG quality (1-100, default: 85)

    Yields:
        str: base64 encoded image data for each page, in page order
    """

    pdf = pdfium.PdfDocument(pdf_path)
    try:
        for page_index in range(len(pdf)):
            page = pdf.get_page(page_index)
            try:
                # Render at 2x scale before converting to a PIL image
                image = page.render(scale=2).to_pil()
            finally:
                # Release the page even if rendering fails
                page.close()
            # Resize the image if it's too large (preserving aspect ratio)
            if max(image.size) > max_size:
                image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
            buffered = BytesIO()
            # Save with optimized settings
            if fmt.upper() == "JPEG":
                image.save(buffered, format="JPEG", quality=quality, optimize=True)
            else:  # PNG
                image.save(buffered, format="PNG", optimize=True)
            yield base64.b64encode(buffered.getvalue()).decode("utf-8")
    finally:
        # Runs on normal exhaustion, on generator close, and on errors, so
        # the document handle is never leaked
        pdf.close()


def _pdf_to_markdown(pdf_path):
    """
    Convert a PDF file to markdown using an llm

    Each page image produced by pdf_to_base64_images is sent to the model
    named by settings.CANVAS_PDF_TRANSCRIPTION_MODEL together with
    settings.CANVAS_TRANSCRIPTION_PROMPT. The per-page replies are
    concatenated in page order.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        str: the concatenated markdown for all pages ("" if there are no
            pages or the model returned no text)
    """
    snippets = []
    for im in pdf_to_base64_images(pdf_path):
        response = completion(
            api_base=settings.LITELLM_API_BASE,
            custom_llm_provider=settings.LITELLM_CUSTOM_PROVIDER,
            model=settings.CANVAS_PDF_TRANSCRIPTION_MODEL,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": settings.CANVAS_TRANSCRIPTION_PROMPT,
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{im}",
                            },
                        },
                    ],
                }
            ],
        )
        content = response.json()["choices"][0]["message"]["content"]
        if not content:
            # The message content can be null/empty; skip the page rather
            # than failing on None.removeprefix
            continue
        # Strip the markdown code fence the model may wrap its reply in
        snippets.append(
            content.removeprefix("```markdown\n").removesuffix("\n```")
        )
    return "".join(snippets)
141 changes: 74 additions & 67 deletions learning_resources/etl/canvas_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
)
from learning_resources.models import LearningResource
from learning_resources_search.constants import CONTENT_FILE_TYPE
from main.utils import checksum_for_content, now_in_utc
from main.utils import now_in_utc

pytestmark = pytest.mark.django_db

Expand Down Expand Up @@ -438,11 +438,14 @@ def test_transform_canvas_content_files_removes_unpublished_content(mocker, tmp_
bulk_unpub.assert_called_once_with([unpublished_cf.id], CONTENT_FILE_TYPE)


@pytest.mark.parametrize("overwrite", [True, False])
@pytest.mark.parametrize("existing_file", [True, False])
def test_transform_canvas_problem_files_pdf_calls_pdf_to_markdown(
tmp_path, mocker, settings
tmp_path, mocker, settings, overwrite, existing_file
):
"""
Test that transform_canvas_problem_files calls _pdf_to_markdown for PDF files
if overwrite is True or there is no existing file. Tika should not be called.
"""

settings.CANVAS_TUTORBOT_FOLDER = "tutorbot/"
Expand All @@ -456,37 +459,66 @@ def test_transform_canvas_problem_files_pdf_calls_pdf_to_markdown(
# return a file with pdf extension
fake_file_data = {
"run": "run",
"content": "original pdf content",
"content_type": "application/pdf",
"archive_checksum": "checksum",
"source_path": f"tutorbot/{pdf_filename}",
"file_extension": ".pdf",
}

mocker.patch(
"learning_resources.etl.canvas.process_olx_path",
return_value=iter([fake_file_data]),
"learning_resources.etl.utils.documents_from_olx",
return_value=iter([[mocker.Mock(), fake_file_data]]),
)

# Patch _pdf_to_markdown to return a known value
pdf_to_md = mocker.patch(
"learning_resources.etl.canvas._pdf_to_markdown",
"learning_resources.etl.utils._pdf_to_markdown",
return_value="markdown content from pdf",
)

tika = mocker.patch(
"learning_resources.etl.utils.extract_text_metadata",
)

run = LearningResourceRunFactory.create()

results = list(transform_canvas_problem_files(zip_path, run, overwrite=True))
if existing_file:
TutorProblemFileFactory.create(
run=run,
type="problem",
archive_checksum="checksum",
source_path=f"tutorbot/{pdf_filename}",
content="existing content",
file_name="problem1.pdf",
)

results = list(transform_canvas_problem_files(zip_path, run, overwrite=overwrite))

pdf_to_md.assert_called_once()
assert results[0]["content"] == "markdown content from pdf"
if overwrite or not existing_file:
pdf_to_md.assert_called_once()
else:
pdf_to_md.assert_not_called()

tika.assert_not_called()

assert (
results[0]["content"] == "markdown content from pdf"
if overwrite or not existing_file
else "existing content"
)
assert results[0]["problem_title"] == "problemset1"


@pytest.mark.django_db
@pytest.mark.parametrize("overwrite", [True, False])
@pytest.mark.parametrize("existing_file", [True, False])
def test_transform_canvas_problem_files_non_pdf_does_not_call_pdf_to_markdown(
tmp_path, mocker, settings
tmp_path, mocker, settings, overwrite, existing_file
):
"""
Test that transform_canvas_problem_files does not call _pdf_to_markdown for non-PDF files.
Test that transform_canvas_problem_files does not call _pdf_to_markdown but calls tika for
non-PDF files. Neither tika nor _pdf_to_markdown should be called if overwrite is false and
there is an existing file.
"""
settings.CANVAS_TUTORBOT_FOLDER = "tutorbot/"
settings.CANVAS_PDF_TRANSCRIPTION_MODEL = "fake-model"
Expand All @@ -498,24 +530,48 @@ def test_transform_canvas_problem_files_non_pdf_does_not_call_pdf_to_markdown(

fake_file_data = {
"run": "run",
"content": csv_content,
"content_type": "application/csv",
"archive_checksum": "checksum",
"source_path": f"tutorbot/{csv_filename}",
"file_extension": ".csv",
}

mocker.patch(
"learning_resources.etl.canvas.process_olx_path",
return_value=iter([fake_file_data]),
"learning_resources.etl.utils.documents_from_olx",
return_value=iter([[mocker.Mock(), fake_file_data]]),
)

pdf_to_md = mocker.patch("learning_resources.etl.canvas._pdf_to_markdown")
pdf_to_md = mocker.patch("learning_resources.etl.utils._pdf_to_markdown")

run = mocker.Mock()
tika = mocker.patch(
"learning_resources.etl.utils.extract_text_metadata",
return_value={"content": csv_content},
)

run = LearningResourceRunFactory.create()

if existing_file:
TutorProblemFileFactory.create(
run=run,
type="problem",
archive_checksum="checksum",
source_path=f"tutorbot/{csv_filename}",
content="existing content",
file_name="problem2.csv",
)

results = list(transform_canvas_problem_files(zip_path, run, overwrite=True))
results = list(transform_canvas_problem_files(zip_path, run, overwrite=overwrite))

pdf_to_md.assert_not_called()
assert results[0]["content"] == csv_content
if overwrite or not existing_file:
tika.assert_called_once()
else:
tika.assert_not_called()
assert (
results[0]["content"] == csv_content
if overwrite or not existing_file
else "existing content"
)
assert results[0]["problem_title"] == "problemset2"


Expand Down Expand Up @@ -1535,52 +1591,3 @@ def test_get_published_items_for_unpublshed_but_embedded(mocker, tmp_path):
}
published = get_published_items(zip_path, url_config)
assert Path("web_resources/file1.pdf").resolve() in published


def test_transform_canvas_problem_files_skips_pdf_to_markdown_if_checksum_exists(
tmp_path, mocker, settings
):
"""
Test that transform_canvas_problem_files does not call _pdf_to_markdown if the checksum already exists.
"""
settings.CANVAS_TUTORBOT_FOLDER = "tutorbot/"
settings.CANVAS_PDF_TRANSCRIPTION_MODEL = "fake-model"
pdf_filename = "problemset3/problem.pdf"
pdf_content = b"%PDF-1.4 fake pdf content"
zip_path = make_canvas_zip(
tmp_path, files=[(f"tutorbot/{pdf_filename}", pdf_content)]
)

original_pdf_content = "original pdf content"
existing_checksum = checksum_for_content(original_pdf_content)

mock_run = LearningResourceRunFactory.create()
TutorProblemFileFactory.create(
run=mock_run,
problem_title="Problem Set 1",
type="problem",
checksum=existing_checksum,
)

fake_file_data = {
"run": mock_run,
"content": original_pdf_content,
"archive_checksum": "checksum",
"source_path": f"tutorbot/{pdf_filename}",
"file_extension": ".pdf",
}

mocker.patch(
"learning_resources.etl.canvas.process_olx_path",
return_value=iter([fake_file_data]),
)

pdf_to_md = mocker.patch("learning_resources.etl.canvas._pdf_to_markdown")

results = list(transform_canvas_problem_files(zip_path, mock_run, overwrite=True))

pdf_to_md.assert_not_called()

assert len(results) == 1
assert results[0]["content"] == "original pdf content"
assert results[0]["source_path"] == f"tutorbot/{pdf_filename}"
Loading
Loading