5 changes: 4 additions & 1 deletion controller/src/db/schema/validations.ts
@@ -76,7 +76,7 @@ export type insertGitRepo = InferInsertModel<typeof schema.gitRepo>;

// ========= renderings
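// Note: in recent drizzle-zod versions, the refinement callback receives the column's inferred
// zod schema, so chaining .nonnegative() onto it extends the inferred type instead of
// replacing it with a bare z.number().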
export const insertRenderingSchema = createInsertSchema(schema.rendering, {
fileSizeBytes: (schema) => z.number().nonnegative(),
fileSizeBytes: (schema) => schema.nonnegative(),
});
export const selectRenderingSchema = createSelectSchema(schema.rendering);
export type renderingType = InferSelectModel<typeof schema.rendering>;
@@ -95,6 +95,9 @@ export const selectScripturalRenderingMetadataSchema = createSelectSchema(
export type scripturalRenderingMetadataType = InferSelectModel<
typeof schema.scripturalRenderingMetadata
>;
export type scripturalRenderingMetadataInsert = InferInsertModel<
typeof schema.scripturalRenderingMetadata
>;
export type insertScripturalRenderingMetadata = InferInsertModel<
typeof schema.scripturalRenderingMetadata
>;
266 changes: 266 additions & 0 deletions controller/src/functions/glUsfmSources.ts
@@ -0,0 +1,266 @@
import {app, InvocationContext, Timer} from "@azure/functions";
import {getDb as startDb} from "../db/config";
import {onConflictSetAllFieldsToSqlExcluded} from "../utils";
import * as dbSchema from "../db/schema/schema";
import {eq, and, isNotNull, like, notInArray, ne} from "drizzle-orm";
import {parse as yamlParse} from "yaml";
import {basename} from "path";
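
// onConflictSetAllFieldsToSqlExcluded (defined in ../utils, not shown in this diff) presumably
// builds the SET clause of an upsert by mapping every column to its `excluded.*` counterpart,
// skipping any columns named in the optional second argument, roughly:
//   set: { fileType: sql`excluded.file_type`, fileSizeBytes: sql`excluded.file_size_bytes`, ... }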

const db = startDb();

export async function populateGlUsfmSources(
myTimer: Timer,
context: InvocationContext
): Promise<void> {
context.log(
`Timer function processed request at ${new Date()}. Next run is ${
myTimer.scheduleStatus?.next
}`
);
try {
await getApplicableContentRows(context);
} catch (error) {
context.error(error);
}
}

async function getApplicableContentRows(context: InvocationContext) {
let WACS_API_URL = process.env.WACS_API_URL;
if (!WACS_API_URL) {
throw new Error("WACS_API_URL is not set");
}
let rows = await getActiveUsfmScriptureContent();
context.log(`There are ${rows.length} gateway content rows`);

let apiBaseUrl = `${WACS_API_URL}/repos`;

let counter = 0;
for await (const row of rows) {
try {
const baseArgs = {
baseUrl: apiBaseUrl,
// non-null assertions are safe because the query for these rows filters with isNotNull on these two props.
user: row.gitUser!,
repo: row.gitRepo!,
};
// We need the latest sha on master in order to request the tree
const sha = await getWacsContentSha(baseArgs);
// from the tree, we only want the usfm paths
const usfmFiles = await getWacsUsfmFromTree({
...baseArgs,
sha,
});
if (!usfmFiles) continue;

// This loop runs serially because batch inserts with CTEs were causing duplicate parts of the query to reserve the same serial id, which made the insert fail. Since this runs as a cron, the performance of the serial async loop isn't a concern here.
for await (const f of usfmFiles) {
const bookSlug = f.bookSlug.toUpperCase();
let bookName = f.bookName;
let rawUrl = `${apiBaseUrl}/${row.gitUser}/${
row.gitRepo
}/raw/${encodeURIComponent(f.path)}`;
await transactDbRowForUsfm({
bookName,
bookSlug,
contentId: row.contentId,
size: f.size,
rawUrl,
sha,
});
}
counter++;
if (counter % 5 === 0) {
context.log(`counter: ${counter}`);
}
} catch (e) {
context.error(row.gitUser, row.gitRepo, e);
}
}
}

async function getActiveUsfmScriptureContent() {
const {
rendering,
content,
language,
gitRepo,
waLangMetadata,
waContentMetadata,
} = dbSchema;
let sourceUsfmSubQuery = db
.select({
contentId: rendering.contentId,
})
.from(rendering)
// source.usfm is a file produced for BTT Writer repos. Some stray Writer repos exist among the GLs, but we only want those that have been combined into a single repo, and those already have usfm, so there's no need to double up.
.where(like(rendering.url, "%source.usfm"));

let res = await db
.select({
gitUser: gitRepo.username,
gitRepo: gitRepo.repoName,
contentId: content.id,
languageId: content.languageId,
})
.from(content)
.leftJoin(language, eq(content.languageId, language.ietfCode))
.leftJoin(waLangMetadata, eq(language.ietfCode, waLangMetadata.ietfCode))
.leftJoin(gitRepo, eq(content.id, gitRepo.contentId))
.leftJoin(waContentMetadata, eq(waContentMetadata.contentId, content.id))
.where(
and(
// get gls (gateway languages)
eq(waLangMetadata.isGateway, true),
// usfm only applies to scripture
eq(content.domain, "scripture"),
// not in source.usfm
notInArray(content.id, sourceUsfmSubQuery),
// has to be on wacs to use wacs api for getting raw files
isNotNull(gitRepo.username),
isNotNull(gitRepo.repoName),
// Even for GLs, there's a lot of noise from wacs repos marked as Inactive that aren't likely interesting, so ignore those
ne(waContentMetadata.status, "Inactive")
)
);
return res;
}

type WacsArgs = {
baseUrl: string;
user: string;
repo: string;
};
type WacsArgsAndSha = WacsArgs & {
sha: string;
};

async function getWacsContentSha({baseUrl, user, repo}: WacsArgs) {
const res = await fetch(`${baseUrl}/${user}/${repo}/git/refs/heads/master`);
const body = await res.json();
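// WACS appears to be a Gitea instance; its refs endpoint returns an array of matching refs, hence the [0] below.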
return body[0]?.object?.sha;
}
async function getWacsUsfmFromTree({
baseUrl,
user,
repo,
sha,
}: WacsArgsAndSha): Promise<null | Array<{
path: string;
sha: string;
url: string;
bookName: string;
bookSlug: string;
size: number;
}>> {
const treeRes = await fetch(
`${baseUrl}/${user}/${repo}/git/trees/${sha}?recursive=true`
);
const treeBody = await treeRes.json();
const usfmFiles = treeBody?.tree?.filter(
(f: any) => f.path.endsWith("usfm") && f.type === "blob"
);
const manifest = treeBody?.tree?.find((f: any) =>
f.path.includes("manifest")
);
if (!manifest) return null;
const manifestType = manifest.path.includes("yaml")
? "yaml"
: manifest.path.includes("json")
? "json"
: null;
// The ecosystem right now should mostly have yaml manifests, plus a few json (from bttwriter, but those are usually partial, so json really shouldn't even be getting through here given the usfm filter above)
if (!manifestType) return null;
const manifestRes = await fetch(manifest.url);
const manifestBody = await manifestRes.json();
const content = Buffer.from(manifestBody.content, "base64").toString();
let parsed = null;
if (manifestType === "yaml") {
parsed = yamlParse(content);
} else if (manifestType === "json") {
parsed = JSON.parse(content);
}
// The resource container manifest's projects array maps book projects to file paths.
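// Illustrative shape, assuming the standard resource container manifest layout:
//   projects:
//     - identifier: gen
//       title: Genesis
//       path: ./01-GEN.usfm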
const projects = parsed?.projects;
if (projects) {
usfmFiles?.forEach((f: any) => {
const match = projects.find((p: any) =>
p.path.toLowerCase().includes(basename(f.path.toLowerCase(), "usfm"))
);
if (match?.title) {
f.bookName = match.title.trim();
f.bookSlug = match.identifier?.trim();
}
});
}
return usfmFiles;
}

type TransactDbRowForUsfmArgs = {
contentId: string;
size: number;
rawUrl: string;
sha: string;
bookSlug: string;
bookName: string | null | undefined;
};
async function transactDbRowForUsfm({
contentId,
size,
rawUrl,
sha,
bookName,
bookSlug,
}: TransactDbRowForUsfmArgs) {
const {scripturalRenderingMetadata, rendering} = dbSchema;
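// Run both inserts in one transaction so a failed metadata upsert can't leave an orphaned rendering row.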
return await db.transaction(async (tx) => {
const renderedRow = await tx
.insert(rendering)
.values({
contentId: contentId,
fileType: "usfm",
fileSizeBytes: size,
url: rawUrl,
hash: sha,
})
.onConflictDoUpdate({
target: [rendering.url],
set: onConflictSetAllFieldsToSqlExcluded(rendering, [
"contentId",
"modifiedOn",
]),
})
.returning({
id: rendering.id,
});
// meta row insertion
await tx
.insert(scripturalRenderingMetadata)
.values({
bookSlug: bookSlug,
bookName: bookName,
isWholeBook: true,
isWholeProject: false,
chapter: null,
renderingId: renderedRow[0]?.id,
})
.onConflictDoUpdate({
target: [scripturalRenderingMetadata.renderingId],
set: onConflictSetAllFieldsToSqlExcluded(scripturalRenderingMetadata),
});
});
}

app.timer("populateGlUsfmSources", {
// Once every 12 hours.
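// Azure timers use six-field NCRONTAB ({second} {minute} {hour} {day} {month} {day-of-week}), so this fires at 00:03:00 and 12:03:00.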
schedule: "0 3 */12 * * *",
handler: populateGlUsfmSources,
useMonitor: process.env.NODE_ENV?.toUpperCase() !== "DEV",
});

/*
Fully expanded form of the raw URL built above, using the production WACS host:
`https://content.bibletranslationtools.org/api/v1/repos/${
row.gitUser
}/${row.gitRepo}/raw/${encodeURIComponent(f.path)}`;
*/
5 changes: 3 additions & 2 deletions controller/src/functions/localization.ts
@@ -13,6 +13,7 @@ const table = insertSchemas.localization.table;

// You can run Azure crons manually by invoking
// http://localhost:7071/admin/functions/manageLocalizationTable (or the function name below). You must use a POST request, a Content-Type: application/json header, and a body such as { "input": "anything" }; the body content doesn't actually matter.
// Secondly, if you want to run only one Azure function without modifying host.json (which does get deployed): if you have func installed, you can run func start --functions <functionName>.
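// For example (assuming the Functions host is running on its default local port 7071):
//   curl -X POST http://localhost:7071/admin/functions/manageLocalizationTable \
//     -H "Content-Type: application/json" \
//     -d '{ "input": "anything" }'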
export async function populateLocalization(
myTimer: Timer,
context: InvocationContext
@@ -166,9 +167,9 @@ async function populateScripturalBookNames() {
}

app.timer("manageLocalizationTable", {
schedule: "*/30 * * * * *",
schedule: "1 1 */4 * * * ",
handler: populateLocalization,
useMonitor: false,
useMonitor: process.env.NODE_ENV?.toUpperCase() !== "DEV",
});

// For resource types,