Skip to content

Commit

Permalink
TIKA-4350 HTML snippet containing <iframe> as root element erroneously recognized as application/xml (#2045)
Browse files Browse the repository at this point in the history

(cherry picked from commit bd878d3)
  • Loading branch information
sebastian-nagel authored and tballison committed Nov 14, 2024
1 parent 0301f81 commit 9d2fad2
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -7680,6 +7680,8 @@
<root-XML localName="SCRIPT"/>
<root-XML localName="frameset"/>
<root-XML localName="FRAMESET"/>
<root-XML localName="iframe"/>
<root-XML localName="IFRAME"/>
<magic priority="60">
<match value="(?i)&lt;(html|head|body|title|div)[ >]" type="regex" offset="0"/>
<match value="(?i)&lt;h[123][ >]" type="regex" offset="0"/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ public void testDetection() throws Exception {
testFile("text/html", "testlargerbuffer.html");
// test fragment of HTML with <div> (TIKA-1102)
testFile("text/html", "htmlfragment");
// test fragment of HTML with <iframe> and potentially misleading file suffix
testFile("text/html", "test-html-snippet-iframe.jsp");
// test binary CGM detection (TIKA-1170)
testFile("image/cgm", "plotutils-bin-cgm-v3.cgm");
// test HTML detection of malformed file, previously identified as image/cgm (TIKA-1170)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
<!-- this is a comment: https://www.example.org/path/file.pdf -->
<iframe src='/path/file.pdf' width='100%' height='100%' target='_blank'>

0 comments on commit 9d2fad2

Please sign in to comment.