
Commit

Nick: fixed
nickscamara committed Jan 4, 2025
1 parent a4f7c38 commit c655c68
Showing 2 changed files with 65 additions and 43 deletions.
7 changes: 7 additions & 0 deletions apps/api/src/lib/canonical-url.ts
@@ -0,0 +1,7 @@
export function normalizeUrl(url: string) {
  url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
  if (url.endsWith("/")) {
    url = url.slice(0, -1);
  }
  return url;
}
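
A quick illustration of what normalizeUrl does (not part of the commit): it strips the scheme, a leading "www.", and any trailing slash, so variants of the same address collapse to one canonical key. The relative import path below is assumed for a standalone sketch.

// Hypothetical standalone usage, not repo code.
import { normalizeUrl } from "./canonical-url";

console.log(normalizeUrl("https://www.example.com/"));  // "example.com"
console.log(normalizeUrl("http://example.com"));        // "example.com"
console.log(normalizeUrl("https://example.com/docs/")); // "example.com/docs"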
101 changes: 58 additions & 43 deletions apps/api/src/services/queue-worker.ts
@@ -51,6 +51,7 @@ import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
 import { indexPage } from "../lib/extract/index/pinecone";
 import { Document } from "../controllers/v1/types";
 import { supabase_service } from "../services/supabase";
+import { normalizeUrl } from "../lib/canonical-url";
 
 configDotenv();
 
@@ -78,54 +79,68 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;
 
 async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
   if (await finishCrawl(job.data.crawl_id)) {
-    // Get all visited URLs from Redis
-    const visitedUrls = await redisConnection.smembers("crawl:" + job.data.crawl_id + ":visited");
-
-    // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape)
-    if (visitedUrls.length > 0 && job.data.crawlerOptions !== null) {
-      try {
-        // First check if entry exists for this origin URL
-        const { data: existingMap } = await supabase_service
-          .from('crawl_maps')
-          .select('urls')
-          .eq('origin_url', sc.originUrl)
-          .single();
-
-        if (existingMap) {
-          // Merge URLs, removing duplicates
-          const mergedUrls = [...new Set([...existingMap.urls, ...visitedUrls])];
-
-          const { error } = await supabase_service
-            .from('crawl_maps')
-            .update({
-              urls: mergedUrls,
-              num_urls: mergedUrls.length,
-              updated_at: new Date().toISOString()
-            })
-            .eq('origin_url', sc.originUrl);
-
-          if (error) {
-            _logger.error("Failed to update crawl map", { error });
-          }
-        } else {
-          // Insert new entry if none exists
-          const { error } = await supabase_service
-            .from('crawl_maps')
-            .insert({
-              origin_url: sc.originUrl,
-              urls: visitedUrls,
-              num_urls: visitedUrls.length,
-              created_at: new Date().toISOString()
-            });
-
-          if (error) {
-            _logger.error("Failed to save crawl map", { error });
-          }
-        }
-      } catch (error) {
-        _logger.error("Error saving crawl map", { error });
-      }
-    }
+    (async () => {
+      const originUrl = sc.originUrl ? normalizeUrl(sc.originUrl) : undefined;
+      // Get all visited URLs from Redis
+      const visitedUrls = await redisConnection.smembers(
+        "crawl:" + job.data.crawl_id + ":visited",
+      );
+      // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape)
+      if (visitedUrls.length > 0 && job.data.crawlerOptions !== null && originUrl) {
+        // Fire and forget the upload to Supabase
+        try {
+          // Standardize URLs to canonical form (https, no www)
+          const standardizedUrls = [
+            ...new Set(
+              visitedUrls.map((url) => {
+                return normalizeUrl(url);
+              }),
+            ),
+          ];
+          // First check if entry exists for this origin URL
+          const { data: existingMap } = await supabase_service
+            .from("crawl_maps")
+            .select("urls")
+            .eq("origin_url", originUrl)
+            .single();
+
+          if (existingMap) {
+            // Merge URLs, removing duplicates
+            const mergedUrls = [
+              ...new Set([...existingMap.urls, ...standardizedUrls]),
+            ];
+
+            const { error } = await supabase_service
+              .from("crawl_maps")
+              .update({
+                urls: mergedUrls,
+                num_urls: mergedUrls.length,
+                updated_at: new Date().toISOString(),
+              })
+              .eq("origin_url", originUrl);
+
+            if (error) {
+              _logger.error("Failed to update crawl map", { error });
+            }
+          } else {
+            // Insert new entry if none exists
+            const { error } = await supabase_service.from("crawl_maps").insert({
+              origin_url: originUrl,
+              urls: standardizedUrls,
+              num_urls: standardizedUrls.length,
+              created_at: new Date().toISOString(),
+              updated_at: new Date().toISOString(),
+            });
+
+            if (error) {
+              _logger.error("Failed to save crawl map", { error });
+            }
+          }
+        } catch (error) {
+          _logger.error("Error saving crawl map", { error });
+        }
+      }
+    })();
 
     if (!job.data.v1) {
       const jobIDs = await getCrawlJobs(job.data.crawl_id);

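Taken together, the new worker code normalizes the crawl's origin and every visited URL, dedupes them, and either merges them into an existing crawl_maps row for that origin or inserts a new one, all inside a detached async IIFE so the Supabase write never blocks crawl completion. Below is a minimal sketch of the merge step, under the assumption that normalizeUrl is inlined and mergeCrawlMapUrls is a hypothetical helper rather than repo code.

// Minimal standalone sketch, not repo code.
// normalizeUrl is copied from apps/api/src/lib/canonical-url.ts.
function normalizeUrl(url: string) {
  url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
  if (url.endsWith("/")) {
    url = url.slice(0, -1);
  }
  return url;
}

// Hypothetical helper mirroring the merge step in the diff above: normalize
// every visited URL, then union the result with the existing crawl map's URLs.
function mergeCrawlMapUrls(existingUrls: string[], visitedUrls: string[]): string[] {
  const standardizedUrls = [...new Set(visitedUrls.map(normalizeUrl))];
  return [...new Set([...existingUrls, ...standardizedUrls])];
}

console.log(
  mergeCrawlMapUrls(
    ["example.com", "example.com/docs"],
    ["https://www.example.com/", "https://example.com/pricing"],
  ),
);
// => [ "example.com", "example.com/docs", "example.com/pricing" ]

Running the sketch shows the intended effect: the www/trailing-slash variant is recognized as a URL that is already stored, and only the genuinely new page is appended.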