Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use ndlaskip as tag to not translate text #2379

Draft
wants to merge 17 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@
"@ndla/video-search": "^8.0.88-alpha.0",
"@tanstack/react-query": "5.62.3",
"auth0-js": "^9.22.1",
"cheerio": "^1.0.0-rc.12",
"compression": "^1.7.4",
"date-fns": "2.30.0",
"diff-match-patch": "^1.0.5",
Expand Down
8 changes: 6 additions & 2 deletions src/components/FileUploader/FileUploader.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -117,14 +117,18 @@ const FileUploader = ({ onFileSave, close }: Props) => {
// Bug in formik's setError function requiring setTimeout to make it work,
// as discussed here: https://github.com/jaredpalmer/formik/discussions/3870
if (fileErrors.includes("FILE_INVALID_TYPE")) {
const errorMessage = `${t("form.file.fileUpload.genericError")}: ${t("form.file.fileUpload.fileTypeInvalidError")}`;
const errorMessage = `${t("form.file.fileUpload.genericError")}: ${t(
"form.file.fileUpload.fileTypeInvalidError",
)}`;
setTimeout(() => {
helpers.setError(errorMessage);
}, 0);
return;
}
if (fileErrors.includes("TOO_MANY_FILES")) {
const errorMessage = `${t("form.file.fileUpload.genericError")}: ${t("form.file.fileUpload.tooManyError")}`;
const errorMessage = `${t("form.file.fileUpload.genericError")}: ${t(
"form.file.fileUpload.tooManyError",
)}`;
setTimeout(() => {
helpers.setError(errorMessage);
}, 0);
Expand Down
12 changes: 3 additions & 9 deletions src/components/NynorskTranslateProvider.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ export interface TranslateType {
type: "text" | "html";
}

const domParser = new DOMParser();
const xmlSerializer = new XMLSerializer();
//const domParser = new DOMParser();
//const xmlSerializer = new XMLSerializer();

export const NynorskTranslateProvider = ({ children }: Props) => {
const translateState = useState<boolean>(false);
Expand Down Expand Up @@ -62,13 +62,7 @@ export const useTranslateToNN = () => {
const content = get(element, field);
if (content) {
const isArray = Array.isArray(content);
// Our backend uses Jsoup to encode html. However, > is not encoded, and nynodata expects it to be. As such, we have to parse
// the entire html string and reencode it using an xmlSerializer.
const parsed =
type === "html" && !isArray
? xmlSerializer.serializeToString(domParser.parseFromString(content, "text/html").body!)
: content;
acc[field] = { content: parsed, type, isArray };
acc[field] = { content, type, isArray };
}
return acc;
}, {});
Expand Down
4 changes: 3 additions & 1 deletion src/components/SlateEditor/plugins/video/SlateVideo.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,9 @@ const SlateVideo = ({ attributes, element, editor, children }: Props) => {
variant="secondary"
title={t("form.video.brightcove")}
aria-label={t("form.video.brightcove")}
to={`https://studio.brightcove.com/products/videocloud/media/videos/${embed.embedData.videoid.split("&t=")[0]}`}
to={`https://studio.brightcove.com/products/videocloud/media/videos/${
embed.embedData.videoid.split("&t=")[0]
}`}
size="small"
>
<LinkMedium />
Expand Down
4 changes: 3 additions & 1 deletion src/containers/AudioUploader/components/AudioContent.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,9 @@ const AudioContent = () => {
const fileErrors = details.files?.[0]?.errors;
if (!fileErrors) return;
if (fileErrors.includes("FILE_INVALID_TYPE")) {
const errorMessage = `${t("form.audio.fileUpload.genericError")}: ${t("form.audio.fileUpload.fileTypeInvalidError")}`;
const errorMessage = `${t("form.audio.fileUpload.genericError")}: ${t(
"form.audio.fileUpload.fileTypeInvalidError",
)}`;
// Bug in formik's setError function requiring setTimeout to make it work,
// as discussed here: https://github.com/jaredpalmer/formik/discussions/3870
setTimeout(() => {
Expand Down
8 changes: 6 additions & 2 deletions src/containers/ImageUploader/components/ImageContent.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -93,14 +93,18 @@ const ImageContent = ({ language }: Props) => {
const fileErrors = details.files?.[0]?.errors;
if (!fileErrors) return;
if (fileErrors.includes("FILE_TOO_LARGE")) {
const errorMessage = `${t("form.image.fileUpload.genericError")}: ${t("form.image.fileUpload.tooLargeError")}`;
const errorMessage = `${t("form.image.fileUpload.genericError")}: ${t(
"form.image.fileUpload.tooLargeError",
)}`;
setTimeout(() => {
helpers.setError(errorMessage);
}, 0);
return;
}
if (fileErrors.includes("FILE_INVALID_TYPE")) {
const errorMessage = `${t("form.image.fileUpload.genericError")}: ${t("form.image.fileUpload.fileTypeInvalidError")}`;
const errorMessage = `${t("form.image.fileUpload.genericError")}: ${t(
"form.image.fileUpload.fileTypeInvalidError",
)}`;
setTimeout(() => {
helpers.setError(errorMessage);
}, 0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -234,9 +234,11 @@ const SearchContent = ({ content, locale, responsibleName }: Props) => {
target="_blank"
aria-label={t("form.workflow.published")}
title={t("form.workflow.published")}
to={`${config.ndlaFrontendDomain}/${content.learningResourceType === "concept" || content.learningResourceType === "gloss" ? "concept" : "article"}/${
content.id
}`}
to={`${config.ndlaFrontendDomain}/${
content.learningResourceType === "concept" || content.learningResourceType === "gloss"
? "concept"
: "article"
}/${content.id}`}
>
<CheckLine />
</SafeLinkIconButton>
Expand Down
12 changes: 9 additions & 3 deletions src/containers/WelcomePage/components/worklist/WorkList.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -150,9 +150,15 @@ const WorkList = ({ ndlaId }: Props) => {
}}
>
<TabsList>
<TabsTrigger value="articles">{`${t("taxonomy.resources")} (${searchQuery.data?.totalCount ?? 0})`}</TabsTrigger>
<TabsTrigger value="onHold">{`${t("welcomePage.workList.onHold")} (${searchOnHoldQuery.data?.totalCount ?? 0})`}</TabsTrigger>
<TabsTrigger value="concepts">{`${t("form.name.concepts")} (${searchConceptsQuery.data?.totalCount ?? 0})`}</TabsTrigger>
<TabsTrigger value="articles">{`${t("taxonomy.resources")} (${
searchQuery.data?.totalCount ?? 0
})`}</TabsTrigger>
<TabsTrigger value="onHold">{`${t("welcomePage.workList.onHold")} (${
searchOnHoldQuery.data?.totalCount ?? 0
})`}</TabsTrigger>
<TabsTrigger value="concepts">{`${t("form.name.concepts")} (${
searchConceptsQuery.data?.totalCount ?? 0
})`}</TabsTrigger>
<TabsIndicator />
</TabsList>
<WelcomePageTabsContent value="articles">
Expand Down
67 changes: 67 additions & 0 deletions src/server/__tests__/translate-test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/**
* Copyright (c) 2025-present, NDLA.
*
* This source code is licensed under the GPLv3 license found in the
* LICENSE file in the root directory of this source tree.
*
*/

import { wrapDocument, unwrapDocument } from "../translate";

/**
 * Builds the document string the tests compare against: the given body
 * fragment placed inside an `<html><head></head><body>…</body></html>` skeleton.
 */
const htmlify = (text: string) => ["<html><head></head><body>", text, "</body></html>"].join("");

// Verifies how wrapDocument marks content with <ndlaskip>. Every expectation
// is compared against the input wrapped in a full html/head/body skeleton.
test("wrapDocument wraps span with lang in ndlaskip", () => {
// Plain text is only wrapped in the document skeleton.
expect(wrapDocument("Hei")).toBe(htmlify("Hei"));
// A span with a lang attribute is enclosed in <ndlaskip>; attribute quotes come back as double quotes.
expect(wrapDocument("Dette er en <span lang='no'>språkmerka tekst</span>")).toBe(
htmlify('Dette er en <ndlaskip><span lang="no">språkmerka tekst</span></ndlaskip>'),
);
// For this embed attribute the whole value is wrapped in <ndlaskip>.
expect(wrapDocument("<ndlaembed data-attr='no'></ndlaembed>")).toBe(
htmlify('<ndlaembed data-attr="<ndlaskip>no</ndlaskip>"></ndlaembed>'),
);
// data-title without any language-tagged span is left untouched.
expect(wrapDocument("<ndlaembed data-title='Her er en tittel som skal oversettes'></ndlaembed>")).toBe(
htmlify('<ndlaembed data-title="Her er en tittel som skal oversettes"></ndlaembed>'),
);
// Both kinds on the same embed: data-title stays as is, data-alt is wrapped entirely.
expect(
wrapDocument(
"<ndlaembed data-title='Her er en tittel som skal oversettes' data-alt='Og en alt som ikke skal oversettes'></ndlaembed>",
),
).toBe(
htmlify(
'<ndlaembed data-title="Her er en tittel som skal oversettes" data-alt="<ndlaskip>Og en alt som ikke skal oversettes</ndlaskip>"></ndlaembed>',
),
);

// Full article fixture: in data-caption only the inner span is wrapped, the empty
// data-align="" is expected back as a bare data-align, and data-type on the
// plain div is expected unchanged.
const document =
'<section><ndlaembed data-resource="image" data-resource_id="3003" data-alt="Jente sitter i bagasjerommet sammen med vesker og kofferter. Foto." data-caption="Jente <span lang=&quot;no&quot;>sitter</span> i bagasjerommet sammen med vesker og kofferter. Foto." data-is-decorative="false" data-border="false" data-align="" data-size="full" data-hide-byline="true" data-url="https://api.test.ndla.no/image-api/v2/images/3003"></ndlaembed><h2><span lang="zh">准备</span> Zhǔnbèi</h2><h3>Dette bør du kjenne til på forhånd:</h3><ul><li>hvordan du kommer med forslag til aktiviteter</li><li>hvordan du uttrykker at du liker eller ikke liker noe</li><li>hvordan du uttrykker at det har oppstått en ny situasjon</li><li>Kinas geografi</li><li>klima og årstider i Kina</li></ul></section><section><div data-type="related-content"><ndlaembed data-article-id="3648" data-resource="related-content"></ndlaembed></div></section>';
const expected =
'<section><ndlaembed data-resource="<ndlaskip>image</ndlaskip>" data-resource_id="<ndlaskip>3003</ndlaskip>" data-alt="<ndlaskip>Jente sitter i bagasjerommet sammen med vesker og kofferter. Foto.</ndlaskip>" data-caption="Jente <ndlaskip><span lang=&quot;no&quot;>sitter</span></ndlaskip> i bagasjerommet sammen med vesker og kofferter. Foto." data-is-decorative="<ndlaskip>false</ndlaskip>" data-border="<ndlaskip>false</ndlaskip>" data-align data-size="<ndlaskip>full</ndlaskip>" data-hide-byline="<ndlaskip>true</ndlaskip>" data-url="<ndlaskip>https://api.test.ndla.no/image-api/v2/images/3003</ndlaskip>"></ndlaembed><h2><ndlaskip><span lang="zh">准备</span></ndlaskip> Zhǔnbèi</h2><h3>Dette bør du kjenne til på forhånd:</h3><ul><li>hvordan du kommer med forslag til aktiviteter</li><li>hvordan du uttrykker at du liker eller ikke liker noe</li><li>hvordan du uttrykker at det har oppstått en ny situasjon</li><li>Kinas geografi</li><li>klima og årstider i Kina</li></ul></section><section><div data-type="related-content"><ndlaembed data-article-id="<ndlaskip>3648</ndlaskip>" data-resource="<ndlaskip>related-content</ndlaskip>"></ndlaembed></div></section>';
expect(wrapDocument(document)).toBe(htmlify(expected));
});

// Verifies that unwrapDocument removes the html/head/body skeleton and every
// <ndlaskip> marker, both in element content and inside ndlaembed attribute values.
test("unwrapDocument does the exact opposite as the other test", () => {
// Plain text: only the document skeleton is removed.
expect(unwrapDocument(htmlify("Hei"))).toBe("Hei");
// <ndlaskip> around an element is removed while the element itself is kept.
expect(unwrapDocument(htmlify('Dette er en <ndlaskip><span lang="no">språkmerka tekst</span></ndlaskip>'))).toBe(
'Dette er en <span lang="no">språkmerka tekst</span>',
);
// <ndlaskip> inside an attribute value is removed, leaving the bare value.
expect(unwrapDocument(htmlify('<ndlaembed data-attr="<ndlaskip>no</ndlaskip>"></ndlaembed>'))).toBe(
'<ndlaembed data-attr="no"></ndlaembed>',
);
// An attribute without markers is expected back unchanged.
expect(unwrapDocument(htmlify('<ndlaembed data-title="Her er en tittel som skal oversettes"></ndlaembed>'))).toBe(
'<ndlaembed data-title="Her er en tittel som skal oversettes"></ndlaembed>',
);
// Marked and unmarked attributes on the same embed.
expect(
unwrapDocument(
htmlify(
'<ndlaembed data-title="Her er en tittel som skal oversettes" data-alt="<ndlaskip>Og en alt som ikke skal oversettes</ndlaskip>"></ndlaembed>',
),
),
).toBe(
'<ndlaembed data-title="Her er en tittel som skal oversettes" data-alt="Og en alt som ikke skal oversettes"></ndlaembed>',
);

// Full article fixture: the wrapped output from the wrapDocument test is expected to
// come back as the original input, including the bare data-align becoming data-align="".
const wrappedDocument =
'<section><ndlaembed data-resource="<ndlaskip>image</ndlaskip>" data-resource_id="<ndlaskip>3003</ndlaskip>" data-alt="<ndlaskip>Jente sitter i bagasjerommet sammen med vesker og kofferter. Foto.</ndlaskip>" data-caption="Jente <ndlaskip><span lang=&quot;no&quot;>sitter</span></ndlaskip> i bagasjerommet sammen med vesker og kofferter. Foto." data-is-decorative="<ndlaskip>false</ndlaskip>" data-border="<ndlaskip>false</ndlaskip>" data-align data-size="<ndlaskip>full</ndlaskip>" data-hide-byline="<ndlaskip>true</ndlaskip>" data-url="<ndlaskip>https://api.test.ndla.no/image-api/v2/images/3003</ndlaskip>"></ndlaembed><h2><ndlaskip><span lang="zh">准备</span></ndlaskip> Zhǔnbèi</h2><h3>Dette bør du kjenne til på forhånd:</h3><ul><li>hvordan du kommer med forslag til aktiviteter</li><li>hvordan du uttrykker at du liker eller ikke liker noe</li><li>hvordan du uttrykker at det har oppstått en ny situasjon</li><li>Kinas geografi</li><li>klima og årstider i Kina</li></ul></section><section><div data-type="related-content"><ndlaembed data-article-id="<ndlaskip>3648</ndlaskip>" data-resource="<ndlaskip>related-content</ndlaskip>"></ndlaembed></div></section>';
const expectedUnwrappedDocument =
'<section><ndlaembed data-resource="image" data-resource_id="3003" data-alt="Jente sitter i bagasjerommet sammen med vesker og kofferter. Foto." data-caption="Jente <span lang=&quot;no&quot;>sitter</span> i bagasjerommet sammen med vesker og kofferter. Foto." data-is-decorative="false" data-border="false" data-align="" data-size="full" data-hide-byline="true" data-url="https://api.test.ndla.no/image-api/v2/images/3003"></ndlaembed><h2><span lang="zh">准备</span> Zhǔnbèi</h2><h3>Dette bør du kjenne til på forhånd:</h3><ul><li>hvordan du kommer med forslag til aktiviteter</li><li>hvordan du uttrykker at du liker eller ikke liker noe</li><li>hvordan du uttrykker at det har oppstått en ny situasjon</li><li>Kinas geografi</li><li>klima og årstider i Kina</li></ul></section><section><div data-type="related-content"><ndlaembed data-article-id="3648" data-resource="related-content"></ndlaembed></div></section>';
expect(unwrapDocument(htmlify(wrappedDocument))).toBe(expectedUnwrappedDocument);
});
80 changes: 75 additions & 5 deletions src/server/translate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
*
*/

import { CheerioAPI, load } from "cheerio";
import { AnyNode } from "domhandler";
import FormData from "form-data";
import fetch from "node-fetch";
import queryString from "query-string";
Expand Down Expand Up @@ -41,7 +43,75 @@ const headers = user
}
: undefined;

const doFetch = (name: string, element: ApiTranslateType): Promise<ResponseType> => {
/**
 * Inserts `<ndlaskip>` markers into the value of one attribute, in place.
 *
 * The attribute value is parsed as a separate HTML document. Without a
 * `selector` the entire value is wrapped in a single `<ndlaskip>`; with a
 * `selector` only the elements matching it are wrapped. A missing or empty
 * attribute is left untouched.
 *
 * @param html Cheerio instance used to look up `element`.
 * @param element Node whose attribute is rewritten.
 * @param attribute Name of the attribute to rewrite.
 * @param selector Optional selector for the parts of the value to wrap.
 */
const wrapAttribute = (html: CheerioAPI, element: AnyNode, attribute: string, selector?: string) => {
  const target = html(element);
  const rawValue = target.attr(attribute);
  if (!rawValue) return;

  const fragment = load(rawValue);
  if (selector) {
    // Only the matching elements inside the value are wrapped.
    fragment(selector).each((_, match) => {
      fragment(match).wrap("<ndlaskip></ndlaskip>");
    });
  } else {
    // No selector: the complete value is wrapped.
    fragment("body").wrapInner("<ndlaskip></ndlaskip>");
  }
  target.attr(attribute, fragment("body").html());
};

/**
 * Runs `wrapAttribute` over every attribute of `element`.
 *
 * Attributes named in the translatable list only get their `span[lang]`
 * children wrapped; every other attribute has its whole value wrapped.
 *
 * @param html Cheerio instance used to look up `element`.
 * @param element Node whose attributes are rewritten in place.
 */
const wrapDataAttributes = (html: CheerioAPI, element: AnyNode) => {
  const translatable = new Set(["data-caption", "data-title", "data-subtitle", "data-description", "data-url-text"]);
  const attributeNames = Object.keys(html(element).attr() ?? {});
  for (const name of attributeNames) {
    // Passing undefined as selector makes wrapAttribute wrap the whole value.
    wrapAttribute(html, element, name, translatable.has(name) ? "span[lang]" : undefined);
  }
};

/**
 * Parses `document` and inserts `<ndlaskip>` markers before it is sent for translation.
 *
 * Wrapped, in this order: `span[lang]` elements, `math` elements, and the
 * attributes of every `ndlaembed` element (via `wrapDataAttributes`).
 *
 * @param document HTML string to mark up.
 * @returns The serialised document returned by cheerio's `html()`.
 */
export const wrapDocument = (document: string): string => {
  const html = load(document);

  // Language-tagged running text first, math second.
  for (const selector of ["span[lang]", "math"]) {
    html(selector).each((_, node) => {
      html(node).wrap("<ndlaskip></ndlaskip>");
    });
  }

  // Then every attribute on the embeds; text attributes are handled separately in wrapDataAttributes.
  html("ndlaembed").each((_, embed) => {
    wrapDataAttributes(html, embed);
  });

  // Our backend uses Jsoup to encode html, but nynodata expects it not to be encoded.
  // The parsed document is therefore serialised again with entity handling switched off.
  const serializeOptions = { xml: { xmlMode: false, decodeEntities: false } };
  return html.html(serializeOptions);
};

/**
 * Removes `<ndlaskip>` markers from every attribute value of `element`, in place.
 *
 * Each attribute value is parsed as a separate HTML document, every `ndlaskip`
 * element in it is replaced by its contents, and the result is written back.
 *
 * NOTE(review): attributes without any marker are parsed and re-serialised too.
 * Confirm that values containing characters such as `&` come back unchanged.
 *
 * @param html Cheerio instance used to look up `element`.
 * @param element Node whose attributes are rewritten; its `attribs` map is read directly.
 */
export const unwrapDataAttributes = (html: CheerioAPI, element: any) => {
  const target = html(element);
  const attributeNames = Object.keys(target.attr() ?? {});
  for (const name of attributeNames) {
    const fragment = load(element.attribs[name]);
    fragment("ndlaskip").each((_, marker) => {
      fragment(marker).contents().unwrap();
    });
    target.attr(name, fragment("body").html());
  }
};

/**
 * Strips the `<ndlaskip>` markers from a translated document.
 *
 * Markers are removed from element content first, then from the attributes of
 * every `ndlaembed` element (via `unwrapDataAttributes`).
 *
 * @param res HTML string received back from translation.
 * @returns The inner HTML of the document's `body`, or an empty string when cheerio returns `null`.
 */
export const unwrapDocument = (res: string): string => {
  const html = load(res);

  html("ndlaskip").each((_, marker) => {
    html(marker).contents().unwrap();
  });
  html("ndlaembed").each((_, embed) => {
    unwrapDataAttributes(html, embed);
  });

  const body = html("body");
  body.unwrap();
  return body.html() ?? "";
};

const doFetch = async (name: string, element: ApiTranslateType): Promise<ResponseType> => {
if (element.type === "text") {
const parsedContent = element.isArray ? element.content.join("|") : element.content;
const params = {
Expand All @@ -60,8 +130,8 @@ const doFetch = (name: string, element: ApiTranslateType): Promise<ResponseType>
});
} else {
const formData = new FormData();
const wrappedContent = `<html>${element.content}</html>`;
const buffer = Buffer.from(wrappedContent);
const content = wrapDocument(`${element.content}`);
const buffer = Buffer.from(content);
const params = { stilmal };

formData.append("file", buffer, { filename: `${name}.html` });
Expand All @@ -73,8 +143,8 @@ const doFetch = (name: string, element: ApiTranslateType): Promise<ResponseType>
.then((res) => res.blob())
.then((res) => res.text())
.then(async (res) => {
const strippedResponse = res.replace("<html>", "").replace("</html>", "");
return { key: name, value: strippedResponse };
const unwrapped = unwrapDocument(res);
return { key: name, value: unwrapped };
});
}
};
Expand Down
Loading