Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/openapi/parameters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,14 @@ base64PageUrl:
schema:
type: string
format: base64url
base64Url:
name: base64Url
description: Base64-encoded URL
in: path
required: true
schema:
type: string
format: base64url
ascending:
name: ascending
description: Whether to sort ascending or descending
Expand Down
34 changes: 34 additions & 0 deletions docs/openapi/schemas.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2225,6 +2225,40 @@ ScrapeJobsByBaseURLResponse:
items:
$ref: '#/CreateAsyncJobAcceptedResponse'

ScrapeUrl:
type: object
properties:
url:
description: The URL that was scraped
type: string
processingType:
description: The processing type/handler used for this scrape
type: string
status:
description: The status of the scrape
type: string
enum:
- 'COMPLETE'
- 'FAILED'
- 'PENDING'
path:
description: The S3 path of the scraped content if the scrape was successful
type: string
createdAt:
description: The timestamp when the scrape was created
$ref: '#/DateTime'
example:
url: 'https://example.com/page1'
processingType: 'form'
status: 'COMPLETE'
path: 'scrapes/12d6ac8e-d5e4-4788-90eb-b69e10e74abc/page1.html'
createdAt: '2024-01-01T10:00:00.000Z'

ScrapeUrlsByProcessingTypeResponse:
type: array
items:
$ref: '#/ScrapeUrl'

BrokenBacklink:
type: object
properties:
Expand Down
28 changes: 28 additions & 0 deletions docs/openapi/scrape-api.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,34 @@ get-scrape-jobs-by-base-url-and-processing-type:
'500':
$ref: './responses.yaml#/500'

get-scrape-urls-by-url-and-processing-type:
parameters:
- $ref: './parameters.yaml#/base64Url'
- $ref: './parameters.yaml#/processingType'
get:
tags:
- scrape
security:
- api_key: [ ]
summary: Get Scrape URLs by URL and Processing Type
description: |
⚠️ **EXPERIMENTAL / PROTOTYPE**
This endpoint is used to retrieve scraped URLs by a specific URL and processing type.
The URL and processingType must be provided as path parameters (see samples).
Returns an array of scrape URL results sorted by creation date (newest first).
operationId: getScrapeUrlByProcessingType
responses:
'200':
description: Scrape URL results
content:
application/json:
schema:
$ref: './schemas.yaml#/ScrapeUrlsByProcessingTypeResponse'
'400':
$ref: './responses.yaml#/400'
'500':
$ref: './responses.yaml#/500'

get-scraped-content-list:
parameters:
- $ref: './parameters.yaml#/siteId'
Expand Down
31 changes: 31 additions & 0 deletions src/controllers/scrapeJob.js
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ function ScrapeJobController(context) {
endDate: requestContext.params.endDate,
baseURL: requestContext.params.baseURL,
processingType: requestContext.params.processingType,
url: requestContext.params.url,
maxAge: requestContext.params.maxAge,
};
}

Expand Down Expand Up @@ -193,12 +195,41 @@ function ScrapeJobController(context) {
}
}

/**
 * Retrieves scrape URL results for a base64-encoded URL and a processing type.
 * The `url` path parameter is base64-decoded before being passed to the
 * scrape client; results are returned sorted by creation date, newest first.
 *
 * @param {Object} requestContext - Request context carrying the path
 *   parameters `url` (base64-encoded) and `processingType`.
 * @returns {Promise<Response>} 200 with a (possibly empty) sorted array,
 *   400 when a required parameter is missing, or an error response on failure.
 */
async function getScrapeUrlByProcessingType(requestContext) {
  const { url: encodedUrl, processingType } = parseRequestContext(requestContext);

  // Guard clauses: both path parameters are mandatory.
  if (!hasText(encodedUrl)) {
    return badRequest('A valid URL is required');
  }
  if (!hasText(processingType)) {
    return badRequest('A processing type is required');
  }

  // Keep the encoded value as a fallback so the error log below always has
  // something meaningful even if decoding itself were to fail.
  let decodedUrl = encodedUrl;
  try {
    decodedUrl = Buffer.from(encodedUrl, 'base64').toString('utf-8').trim();

    const results = await scrapeClient.getScrapeUrlsByProcessingType(
      decodedUrl,
      processingType,
    );
    if (!results || results.length === 0) {
      return ok([]);
    }

    // Sort newest-first by creation timestamp.
    const newestFirst = [...results].sort(
      (first, second) => new Date(second.createdAt) - new Date(first.createdAt),
    );
    return ok(newestFirst);
  } catch (error) {
    log.error(`Failed to fetch scrape URLs for url: ${decodedUrl} and processingType: ${processingType}, ${error.message}`);
    return createErrorResponse(error);
  }
}

return {
createScrapeJob,
getScrapeJobStatus,
getScrapeJobUrlResults,
getScrapeJobsByBaseURL,
getScrapeJobsByDateRange,
getScrapeUrlByProcessingType,
};
}

Expand Down
1 change: 1 addition & 0 deletions src/routes/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ export default function getRouteHandlers(
'GET /tools/scrape/jobs/by-date-range/:startDate/:endDate/all-jobs': scrapeJobController.getScrapeJobsByDateRange,
'GET /tools/scrape/jobs/by-base-url/:baseURL': scrapeJobController.getScrapeJobsByBaseURL,
'GET /tools/scrape/jobs/by-base-url/:baseURL/by-processingtype/:processingType': scrapeJobController.getScrapeJobsByBaseURL,
'GET /tools/scrape/jobs/by-url/:url/:processingType': scrapeJobController.getScrapeUrlByProcessingType,

// Fixes
'GET /sites/:siteId/opportunities/:opportunityId/fixes': (c) => fixesController.getAllForOpportunity(c),
Expand Down
111 changes: 111 additions & 0 deletions test/controllers/scrape-job.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ describe('ScrapeJobController tests', () => {
},
ScrapeUrl: {
allByScrapeJobId: sandbox.stub().resolves([]),
allRecentByUrlAndProcessingType: sandbox.stub().resolves([]),
create: (data) => createScrapeUrl(data),
},
};
Expand Down Expand Up @@ -641,4 +642,114 @@ describe('ScrapeJobController tests', () => {
expect(response.headers.get('x-error')).to.equal('Base URL required');
});
});

// Tests for ScrapeJobController.getScrapeUrlByProcessingType: parameter
// validation, empty-result handling, descending createdAt sort, error
// propagation, and base64 decoding of the :url path parameter.
// NOTE(review): these tests stub dataAccess.ScrapeUrl.allRecentByUrlAndProcessingType
// while the controller calls scrapeClient.getScrapeUrlsByProcessingType —
// presumably the scrape client delegates to that data-access method; verify.
describe('getScrapeUrlByProcessingType', () => {
  // Base64 form of the plain URL the handler is expected to decode.
  const encodedUrl = Buffer.from('https://www.example.com/page1').toString('base64');

  beforeEach(() => {
    // Rebuild the controller from the shared baseContext; stubs assigned in
    // each test below must still be picked up, so the controller presumably
    // reads dataAccess lazily — confirm if this ordering ever changes.
    scrapeJobController = ScrapeJobController(baseContext);
  });

  // Missing/empty url path parameter -> 400 with a specific x-error header.
  it('should return 400 when URL is not provided', async () => {
    baseContext.params.url = '';
    baseContext.params.processingType = 'form';

    const response = await scrapeJobController.getScrapeUrlByProcessingType(baseContext);
    expect(response).to.be.an.instanceOf(Response);
    expect(response.status).to.equal(400);
    expect(response.headers.get('x-error')).to.equal('A valid URL is required');
  });

  // Missing/empty processingType path parameter -> 400.
  it('should return 400 when processingType is not provided', async () => {
    baseContext.params.url = encodedUrl;
    baseContext.params.processingType = '';

    const response = await scrapeJobController.getScrapeUrlByProcessingType(baseContext);
    expect(response).to.be.an.instanceOf(Response);
    expect(response.status).to.equal(400);
    expect(response.headers.get('x-error')).to.equal('A processing type is required');
  });

  // No matches -> 200 with an empty JSON array (not a 404).
  it('should return empty array when no scrape URLs are found', async () => {
    // eslint-disable-next-line max-len
    baseContext.dataAccess.ScrapeUrl.allRecentByUrlAndProcessingType = sandbox.stub().resolves([]);
    baseContext.params.url = encodedUrl;
    baseContext.params.processingType = 'form';

    const response = await scrapeJobController.getScrapeUrlByProcessingType(baseContext);
    expect(response).to.be.an.instanceOf(Response);
    expect(response.status).to.equal(200);
    const result = await response.json();
    expect(result).to.deep.equal([]);
  });

  // Fixtures are deliberately out of order to exercise the sort.
  it('should return sorted scrape URLs when results are found', async () => {
    const mockScrapeUrls = [
      createScrapeUrl({
        url: 'https://www.example.com/page1',
        processingType: 'form',
        status: 'COMPLETE',
        createdAt: '2024-01-01T10:00:00.000Z',
        path: 'path/to/result1',
      }),
      createScrapeUrl({
        url: 'https://www.example.com/page2',
        processingType: 'form',
        status: 'COMPLETE',
        createdAt: '2024-01-02T10:00:00.000Z',
        path: 'path/to/result2',
      }),
      createScrapeUrl({
        url: 'https://www.example.com/page3',
        processingType: 'form',
        status: 'COMPLETE',
        createdAt: '2024-01-01T15:00:00.000Z',
        path: 'path/to/result3',
      }),
    ];

    // eslint-disable-next-line max-len
    baseContext.dataAccess.ScrapeUrl.allRecentByUrlAndProcessingType = sandbox.stub().resolves(mockScrapeUrls);
    baseContext.params.url = encodedUrl;
    baseContext.params.processingType = 'form';

    const response = await scrapeJobController.getScrapeUrlByProcessingType(baseContext);
    expect(response).to.be.an.instanceOf(Response);
    expect(response.status).to.equal(200);
    const result = await response.json();

    // Results should be sorted by createdAt in descending order (newest first)
    expect(result).to.have.lengthOf(3);
    expect(result[0].createdAt).to.equal('2024-01-02T10:00:00.000Z');
    expect(result[1].createdAt).to.equal('2024-01-01T15:00:00.000Z');
    expect(result[2].createdAt).to.equal('2024-01-01T10:00:00.000Z');
  });

  // A rejected data-access call surfaces as a 500 with a descriptive header.
  // NOTE(review): the expected x-error text below differs from the log format
  // in the controller ("Failed to fetch scrape URLs for url: ...") — confirm
  // which layer (createErrorResponse vs. the client) produces this message.
  it('should handle errors gracefully', async () => {
    baseContext.dataAccess.ScrapeUrl.allRecentByUrlAndProcessingType = sandbox.stub().rejects(new Error('Database connection failed'));
    baseContext.params.url = encodedUrl;
    baseContext.params.processingType = 'form';

    const response = await scrapeJobController.getScrapeUrlByProcessingType(baseContext);
    expect(response).to.be.an.instanceOf(Response);
    expect(response.status).to.equal(500);
    expect(response.headers.get('x-error')).to.equal('Failed to fetch scrape URL by URL: https://www.example.com/page1 and processing type: form, Database connection failed');
  });

  // The base64-encoded :url param must be decoded before hitting data access.
  it('should decode base64 URL correctly', async () => {
    const allRecentByUrlAndProcessingTypeStub = sandbox.stub().resolves([]);
    // eslint-disable-next-line max-len
    baseContext.dataAccess.ScrapeUrl.allRecentByUrlAndProcessingType = allRecentByUrlAndProcessingTypeStub;
    baseContext.params.url = encodedUrl;
    baseContext.params.processingType = 'form';

    await scrapeJobController.getScrapeUrlByProcessingType(baseContext);

    // Verify the decoded URL was passed to the data access layer
    expect(allRecentByUrlAndProcessingTypeStub).to.have.been.calledOnce;
    const callArgs = allRecentByUrlAndProcessingTypeStub.getCall(0).args;
    expect(callArgs[0]).to.equal('https://www.example.com/page1');
    expect(callArgs[1]).to.equal('form');
  });
});
});
2 changes: 2 additions & 0 deletions test/routes/index.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ describe('getRouteHandlers', () => {
getScrapeJobResult: sinon.stub(),
getScrapeJobProgress: sinon.stub(),
getScrapeJobsByDateRange: sinon.stub(),
getScrapeUrlByProcessingType: sinon.stub(),
};

const mockApiKeyController = {
Expand Down Expand Up @@ -439,6 +440,7 @@ describe('getRouteHandlers', () => {
'DELETE /sites/:siteId/reports/:reportId',
'GET /tools/scrape/jobs/by-base-url/:baseURL',
'GET /tools/scrape/jobs/by-base-url/:baseURL/by-processingtype/:processingType',
'GET /tools/scrape/jobs/by-url/:url/:processingType',
'PATCH /sites/:siteId/config/cdn-logs',
'GET /sites/:siteId/llmo/sheet-data/:dataSource',
'GET /sites/:siteId/llmo/sheet-data/:sheetType/:dataSource',
Expand Down