From 47293f233711a2f8c9338a4047ccc46f4417bc39 Mon Sep 17 00:00:00 2001 From: Rhys Berrow Date: Tue, 4 Jun 2024 11:53:20 +0100 Subject: [PATCH 1/7] Add validation for invalid html tags --- app/error_messages.py | 1 + app/validators/questionnaire_validator.py | 38 ++++++++ .../test_invalid_html_in_schema_text.json | 97 +++++++++++++++++++ .../test_introduction_with_guidance.json | 2 +- tests/test_questionnaire_validator.py | 22 +++++ 5 files changed, 159 insertions(+), 1 deletion(-) create mode 100644 tests/schemas/invalid/test_invalid_html_in_schema_text.json diff --git a/app/error_messages.py b/app/error_messages.py index e33b6256b..96cb50be4 100644 --- a/app/error_messages.py +++ b/app/error_messages.py @@ -1,4 +1,5 @@ DUMB_QUOTES_FOUND = "Found dumb quotes(s) in schema text" +HTML_FOUND = "Found invalid HTML in schema text" INVALID_WHITESPACE_FOUND = "Found invalid white space(s) in schema text" DUPLICATE_ID_FOUND = "Duplicate id found" FOR_LIST_NEVER_POPULATED = "for_list is not populated by any ListCollector blocks or supplementary data sources" diff --git a/app/validators/questionnaire_validator.py b/app/validators/questionnaire_validator.py index 1634c897a..f72d6a5e0 100644 --- a/app/validators/questionnaire_validator.py +++ b/app/validators/questionnaire_validator.py @@ -30,6 +30,7 @@ def validate(self): self.validate_duplicates() self.validate_smart_quotes() self.validate_white_spaces() + self.validate_html() for section in self.questionnaire_schema.sections: section_validator = SectionValidator(section, self.questionnaire_schema) @@ -107,6 +108,43 @@ def validate_smart_quotes(self): pointer=translatable_item.pointer, ) + def validate_html(self): + html_strings = [] + schema_object = SurveySchema(self.schema_element) + + # pylint: disable=invalid-string-quote + html_regex = re.compile(r"<[^>]*>") + + for translatable_item in schema_object.translatable_items: + schema_text = translatable_item.value + + values_to_check = [schema_text] + + if isinstance(schema_text, dict): + values_to_check = schema_text.values() + + for schema_text in values_to_check: + if schema_text and html_regex.search(schema_text): + html_strings.append( + {"pointer": translatable_item.pointer, "text": schema_text} + ) + + if html_strings: + self.check_invalid_html_tags(html_strings) + + def check_invalid_html_tags(self, html_strings): + strong = re.compile("]>*>|") + anchor = re.compile("]>*>|") + for html_string in html_strings: + if not strong.search(html_string["text"]) and not anchor.search( + html_string["text"] + ): + self.add_error( + error_messages.HTML_FOUND, + pointer=html_string["pointer"], + text=html_string["text"], + ) + def validate_white_spaces(self): schema_object = SurveySchema(self.schema_element) diff --git a/tests/schemas/invalid/test_invalid_html_in_schema_text.json b/tests/schemas/invalid/test_invalid_html_in_schema_text.json new file mode 100644 index 000000000..dc3f02bea --- /dev/null +++ b/tests/schemas/invalid/test_invalid_html_in_schema_text.json @@ -0,0 +1,97 @@ +{ + "mime_type": "application/json/ons/eq", + "language": "en", + "schema_version": "0.0.1", + "data_version": "0.0.3", + "survey_id": "144", + "theme": "default", + "title": "Test invalid html", + "legal_basis": "Notice is given under section 999 of the Test Act 2000", + "metadata": [ + { + "name": "user_id", + "type": "string" + }, + { + "name": "period_id", + "type": "string" + }, + { + "name": "ru_name", + "type": "string" + }, + { + "name": "ru_ref", + "type": "string" + }, + { + "name": "trad_as", + "type": "string", + "optional": true + } + ], + "questionnaire_flow": { + "type": "Linear", + "options": {} + }, + "sections": [ + { + "id": "introduction-section", + "title": "Introduction", + "groups": [ + { + "id": "introduction-group", + "title": "General Business Information", + "blocks": [ + { + "id": "introduction", + "type": "Introduction", + "primary_content": [ + { + "id": "business-details", + "title": "Introduction with valid and invalid HTML", + "contents": [ + { + "guidance": { + "contents": [ + { + "title": "Coronavirus (COVID-19) guidance", + "description": "Explain your figures in the comment section to minimise us contacting you and to help us tell an industry story" + } + ] + } + } + ] + } + ] + }, + { + "type": "Interstitial", + "id": "intersitital-one", + "content": { + "title": "Page with invalid html", + "contents": [ + { + "description": "

You have successfully completed this section

" + } + ] + } + }, + { + "type": "Interstitial", + "id": "intersitital-two", + "content": { + "title": "Page with link", + "contents": [ + { + "description": "Anchor" + } + ] + } + } + ] + } + ] + } + ] +} diff --git a/tests/schemas/valid/test_introduction_with_guidance.json b/tests/schemas/valid/test_introduction_with_guidance.json index bb2e21b57..a3bbab155 100644 --- a/tests/schemas/valid/test_introduction_with_guidance.json +++ b/tests/schemas/valid/test_introduction_with_guidance.json @@ -72,7 +72,7 @@ "title": "Section complete", "contents": [ { - "description": "

You have successfully completed this section

" + "description": "You have successfully completed this section" } ] } diff --git a/tests/test_questionnaire_validator.py b/tests/test_questionnaire_validator.py index fec1ee0d5..4cde74daf 100644 --- a/tests/test_questionnaire_validator.py +++ b/tests/test_questionnaire_validator.py @@ -336,6 +336,28 @@ def test_invalid_whitespaces_in_schema(): assert validator.errors == expected_error_messages +def test_invalid_html_in_schema(): + filename = "schemas/invalid/test_invalid_html_in_schema_text.json" + + validator = QuestionnaireValidator(_open_and_load_schema_file(filename)) + + expected_error_messages = [ + { + "message": error_messages.HTML_FOUND, + "pointer": "/sections/0/groups/0/blocks/1/content/contents/0/description", + "text": "

You have successfully completed this section

", + }, + { + "message": error_messages.HTML_FOUND, + "pointer": "/sections/0/groups/0/blocks/0/primary_content/0/contents/0/guidance/contents/0/title", + "text": "Coronavirus (COVID-19) guidance", + }, + ] + validator.validate_html() + + assert validator.errors == expected_error_messages + + def test_invalid_answer_type_for_question_summary_concatenation(): filename = "schemas/invalid/test_invalid_answer_type_for_question_summary.json" From e57eb81cb9a74cc3ef32460ae08f85d4d03df167 Mon Sep 17 00:00:00 2001 From: Rhys Berrow Date: Tue, 11 Jun 2024 11:32:25 +0100 Subject: [PATCH 2/7] PR comments --- app/validators/questionnaire_validator.py | 26 +++++++++-------- .../test_invalid_html_in_schema_text.json | 28 +++++++++++++++++-- tests/test_questionnaire_validator.py | 15 ++++++++++ 3 files changed, 56 insertions(+), 13 deletions(-) diff --git a/app/validators/questionnaire_validator.py b/app/validators/questionnaire_validator.py index f72d6a5e0..dcd7416d6 100644 --- a/app/validators/questionnaire_validator.py +++ b/app/validators/questionnaire_validator.py @@ -123,22 +123,26 @@ def validate_html(self): if isinstance(schema_text, dict): values_to_check = schema_text.values() - for schema_text in values_to_check: - if schema_text and html_regex.search(schema_text): - html_strings.append( - {"pointer": translatable_item.pointer, "text": schema_text} - ) - + html_strings.extend( + {"pointer": translatable_item.pointer, "text": schema_text} + for schema_text in values_to_check + if schema_text and html_regex.search(schema_text) + ) if html_strings: self.check_invalid_html_tags(html_strings) def check_invalid_html_tags(self, html_strings): - strong = re.compile("]>*>|") - anchor = re.compile("]>*>|") + strong = re.compile(r"(?:(?!<\/strong>).)*") + anchor = re.compile(".*") + all_tags = re.compile( + "<([a-z0-9]+)(?=[\s>])(?:[^>=]|='[^']*'|=\"[^\"]*\"|=[^'\"\s]*)*\s?/?>" + ) + for html_string in html_strings: - if not strong.search(html_string["text"]) and not anchor.search( - html_string["text"] - ): + if len(strong.findall(html_string["text"])) + len( + anchor.findall(html_string["text"]) + ) != len(all_tags.findall(html_string["text"])): + self.add_error( error_messages.HTML_FOUND, pointer=html_string["pointer"], diff --git a/tests/schemas/invalid/test_invalid_html_in_schema_text.json b/tests/schemas/invalid/test_invalid_html_in_schema_text.json index dc3f02bea..2246c6ee6 100644 --- a/tests/schemas/invalid/test_invalid_html_in_schema_text.json +++ b/tests/schemas/invalid/test_invalid_html_in_schema_text.json @@ -79,12 +79,36 @@ }, { "type": "Interstitial", - "id": "intersitital-two", + "id": "interstitial-two", "content": { "title": "Page with link", "contents": [ { - "description": "Anchor" + "description": "Anchor" + } + ] + } + }, + { + "type": "Interstitial", + "id": "interstitial-three", + "content": { + "title": "Page with mixed invalid tags", + "contents": [ + { + "description": "

Title

Not valid tag" + } + ] + } + }, + { + "type": "Interstitial", + "id": "interstitial-four", + "content": { + "title": "Valid double strong with another strong.", + "contents": [ + { + "description": "TitleNot valid tag" } ] } diff --git a/tests/test_questionnaire_validator.py b/tests/test_questionnaire_validator.py index 4cde74daf..bf6f14314 100644 --- a/tests/test_questionnaire_validator.py +++ b/tests/test_questionnaire_validator.py @@ -342,11 +342,26 @@ def test_invalid_html_in_schema(): validator = QuestionnaireValidator(_open_and_load_schema_file(filename)) expected_error_messages = [ + { + "message": error_messages.HTML_FOUND, + "pointer": "/sections/0/groups/0/blocks/3/content/title", + "text": "Page with mixed invalid tags", + }, { "message": error_messages.HTML_FOUND, "pointer": "/sections/0/groups/0/blocks/1/content/contents/0/description", "text": "

You have successfully completed this section

", }, + { + "message": error_messages.HTML_FOUND, + "pointer": "/sections/0/groups/0/blocks/3/content/contents/0/description", + "text": "

Title

Not valid tag", + }, + { + "message": error_messages.HTML_FOUND, + "pointer": "/sections/0/groups/0/blocks/4/content/contents/0/description", + "text": "TitleNot valid tag", + }, { "message": error_messages.HTML_FOUND, "pointer": "/sections/0/groups/0/blocks/0/primary_content/0/contents/0/guidance/contents/0/title", From e7c136f7ac107f406fd3330b62436897cd2713ed Mon Sep 17 00:00:00 2001 From: Rhys Berrow Date: Tue, 11 Jun 2024 11:58:10 +0100 Subject: [PATCH 3/7] Lint --- app/validators/questionnaire_validator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/app/validators/questionnaire_validator.py b/app/validators/questionnaire_validator.py index dcd7416d6..5e60a8310 100644 --- a/app/validators/questionnaire_validator.py +++ b/app/validators/questionnaire_validator.py @@ -132,10 +132,10 @@ def validate_html(self): self.check_invalid_html_tags(html_strings) def check_invalid_html_tags(self, html_strings): - strong = re.compile(r"(?:(?!<\/strong>).)*") - anchor = re.compile(".*") + strong = re.compile(r"(?:(?!).)*
") + anchor = re.compile("(?:(?!") all_tags = re.compile( - "<([a-z0-9]+)(?=[\s>])(?:[^>=]|='[^']*'|=\"[^\"]*\"|=[^'\"\s]*)*\s?/?>" + r"<([a-z0-9]+)(?=[\s>])(?:[^>=]|='[^']*'|=\"[^\"]*\"|=[^'\"\s]*)*\s?/?>" ) for html_string in html_strings: From 51deeeb2645f527730d95d0e54592dbc56fb7b1f Mon Sep 17 00:00:00 2001 From: Rhys Berrow Date: Wed, 12 Jun 2024 07:08:48 +0100 Subject: [PATCH 4/7] Raw string --- app/validators/questionnaire_validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/validators/questionnaire_validator.py b/app/validators/questionnaire_validator.py index 5e60a8310..c841ad7b7 100644 --- a/app/validators/questionnaire_validator.py +++ b/app/validators/questionnaire_validator.py @@ -133,7 +133,7 @@ def validate_html(self): def check_invalid_html_tags(self, html_strings): strong = re.compile(r"(?:(?!).)*
") - anchor = re.compile("(?:(?!") + anchor = re.compile(r"(?:(?!") all_tags = re.compile( r"<([a-z0-9]+)(?=[\s>])(?:[^>=]|='[^']*'|=\"[^\"]*\"|=[^'\"\s]*)*\s?/?>" ) From a5e64e2f510e83abef64432ce375c64ecc9c6e06 Mon Sep 17 00:00:00 2001 From: Rhys Berrow Date: Wed, 12 Jun 2024 10:33:02 +0100 Subject: [PATCH 5/7] Double anchor --- app/validators/questionnaire_validator.py | 2 +- .../invalid/test_invalid_html_in_schema_text.json | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/app/validators/questionnaire_validator.py b/app/validators/questionnaire_validator.py index c841ad7b7..098fdaa1b 100644 --- a/app/validators/questionnaire_validator.py +++ b/app/validators/questionnaire_validator.py @@ -133,7 +133,7 @@ def validate_html(self): def check_invalid_html_tags(self, html_strings): strong = re.compile(r"(?:(?!).)*") - anchor = re.compile(r"(?:(?!") + anchor = re.compile(r"]*>.*?") all_tags = re.compile( r"<([a-z0-9]+)(?=[\s>])(?:[^>=]|='[^']*'|=\"[^\"]*\"|=[^'\"\s]*)*\s?/?>" ) diff --git a/tests/schemas/invalid/test_invalid_html_in_schema_text.json b/tests/schemas/invalid/test_invalid_html_in_schema_text.json index 2246c6ee6..3e387f324 100644 --- a/tests/schemas/invalid/test_invalid_html_in_schema_text.json +++ b/tests/schemas/invalid/test_invalid_html_in_schema_text.json @@ -112,6 +112,18 @@ } ] } + }, + { + "type": "Interstitial", + "id": "interstitial-five", + "content": { + "title": "Valid double anchor.", + "contents": [ + { + "description": "Title and Not valid tag" + } + ] + } } ] } From 3e36e0758c1e6f3f443e47289974160ca47a1ded Mon Sep 17 00:00:00 2001 From: Rhys Berrow <47635349+berroar@users.noreply.github.com> Date: Thu, 13 Jun 2024 08:39:44 +0100 Subject: [PATCH 6/7] Update app/validators/questionnaire_validator.py Co-authored-by: petechd <53475968+petechd@users.noreply.github.com> --- app/validators/questionnaire_validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/validators/questionnaire_validator.py b/app/validators/questionnaire_validator.py index 098fdaa1b..ed2b54315 100644 --- a/app/validators/questionnaire_validator.py +++ b/app/validators/questionnaire_validator.py @@ -133,7 +133,7 @@ def validate_html(self): def check_invalid_html_tags(self, html_strings): strong = re.compile(r"(?:(?!).)*
") - anchor = re.compile(r"]*>.*?") + anchor = re.compile(r"]*>.*?") all_tags = re.compile( r"<([a-z0-9]+)(?=[\s>])(?:[^>=]|='[^']*'|=\"[^\"]*\"|=[^'\"\s]*)*\s?/?>" ) From 6ba6d50c8ae052064cb51993fef4a1bf34fc3ca4 Mon Sep 17 00:00:00 2001 From: Rhys Berrow Date: Tue, 18 Jun 2024 09:01:21 +0100 Subject: [PATCH 7/7] Revert href change --- app/validators/questionnaire_validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/validators/questionnaire_validator.py b/app/validators/questionnaire_validator.py index ed2b54315..098fdaa1b 100644 --- a/app/validators/questionnaire_validator.py +++ b/app/validators/questionnaire_validator.py @@ -133,7 +133,7 @@ def validate_html(self): def check_invalid_html_tags(self, html_strings): strong = re.compile(r"(?:(?!).)*") - anchor = re.compile(r"]*>.*?") + anchor = re.compile(r"]*>.*?") all_tags = re.compile( r"<([a-z0-9]+)(?=[\s>])(?:[^>=]|='[^']*'|=\"[^\"]*\"|=[^'\"\s]*)*\s?/?>" )