Skip to content

Commit

Permalink
TIKA-4338 -- remove tagsoup entirely (#2011)
Browse files Browse the repository at this point in the history
  • Loading branch information
tballison committed Oct 25, 2024
1 parent 66ec0fb commit 2bb4624
Show file tree
Hide file tree
Showing 13 changed files with 279 additions and 86 deletions.
10 changes: 0 additions & 10 deletions tika-bom/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -257,11 +257,6 @@
<artifactId>tika-parser-sqlite3-package</artifactId>
<version>3.0.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parser-tagsoup-package</artifactId>
<version>3.0.0-SNAPSHOT</version>
</dependency>

<!-- Tika parsers modules (extended package) -->
<dependency>
Expand All @@ -274,11 +269,6 @@
<artifactId>tika-parser-sqlite3-module</artifactId>
<version>3.0.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parser-tagsoup-module</artifactId>
<version>3.0.0-SNAPSHOT</version>
</dependency>
<!-- Tika parsers modules (ML package) -->
<dependency>
<groupId>org.apache.tika</groupId>
Expand Down
2 changes: 1 addition & 1 deletion tika-bundles/tika-bundle-standard/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@
jackcess|
jackcess-encrypt|
commons-lang3|
tagsoup|
jsoup|
asm|
juniversalchardet|
vorbis-java-core|
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,7 @@ public void endElement(String uri, String localName, String name) throws SAXExce
if (matcher.matchesElement()) {
super.endElement(uri, localName, name);
}
// Sometimes tagsoup returns double end tags, so the stack might
// be empty! TODO: Remove this when the tagsoup problem is fixed.
// This emptiness check was originally added for tagsoup, but it is needed in general:
// the matcher stack can already be empty when an end tag arrives.
if (!matchers.isEmpty()) {
matcher = matchers.removeFirst();
}
Expand Down
2 changes: 1 addition & 1 deletion tika-eval/tika-eval-app/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@
<exclude>org.apache.lucene:lucene-core:jar:</exclude>
<exclude>org.apache.lucene:lucene-analysis-common:jar:</exclude>
<exclude>org.apache.lucene:lucene-analysis-icu:jar:</exclude>
<exclude>org.ccil.cowan.tagsoup:tagsoup:jar:</exclude>
<exclude>org.jsoup:jsoup:jar:</exclude>
<exclude>com.ibm.icu:icu4j:jar:</exclude>
<exclude>com.fasterxml.jackson.core:jackson-core:jar:</exclude>
<exclude>com.fasterxml.jackson.core:jackson-databind:jar:</exclude>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,7 @@ public void testBadTags() throws Exception {
List<Map<Cols, String>> tableInfosB = WRITER.getTable(ExtractComparer.TAGS_TABLE_B);
assertEquals(1, tableInfosB.size());
Map<Cols, String> tableInfoB = tableInfosB.get(0);
//there actually is a tag problem, but tagsoup fixes it.
//there actually is a tag problem, but jsoup fixes it.
//this confirms behavior.
assertEquals("false", tableInfoB.get(Cols.TAGS_PARSE_EXCEPTION));
}
Expand Down
4 changes: 2 additions & 2 deletions tika-eval/tika-eval-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,8 @@
<artifactId>commons-lang3</artifactId>
</dependency>
<dependency>
<groupId>org.ccil.cowan.tagsoup</groupId>
<artifactId>tagsoup</artifactId>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
</dependency>

<dependency>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,24 @@
import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import javax.xml.XMLConstants;

import org.ccil.cowan.tagsoup.jaxp.SAXParserImpl;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.DataNode;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeFilter;
import org.jsoup.select.NodeTraversor;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.ParseContext;
Expand All @@ -53,11 +63,106 @@ public static ContentTags parseHTML(String html, Set<String> uppercaseTagsOfInte
Map<String, Integer> tags = new HashMap<>();
XHTMLContentTagHandler xhtmlContentTagHandler =
new XHTMLContentTagHandler(uppercaseTagsOfInterest, tags);
SAXParserImpl.newInstance(null)
.parse(new InputSource(new StringReader(html)), xhtmlContentTagHandler);
Document document = Jsoup.parse(html);
NodeTraversor.filter(new TikaNodeFilter(xhtmlContentTagHandler), document);

return new ContentTags(xhtmlContentTagHandler.toString(), tags);
}

/**
 * Replays a jsoup node traversal as SAX events on the wrapped {@link ContentHandler}.
 * <p>
 * Everything visited before the {@code html} element starts or after it ends is skipped.
 * Text and data nodes are forwarded as {@code characters} events; every other node is
 * reported as an element (start and end) with its attributes copied over.
 * <p>
 * {@code head} and {@code tail} declare no checked exceptions, so a {@link SAXException}
 * thrown by the handler is rethrown wrapped in a {@link RuntimeSAXException}.
 */
private static class TikaNodeFilter implements NodeFilter {
    //true while the traversal is outside of the html element
    private boolean ignore = true;
    private final ContentHandler handler;

    private TikaNodeFilter(ContentHandler handler) {
        this.handler = handler;
    }

    /**
     * Called when a node is entered: emits {@code characters} for text/data nodes and
     * {@code startElement} for everything else inside the html element.
     *
     * @param node the node being entered
     * @param i the depth reported by the traversal (unused)
     * @return always {@code FilterResult.CONTINUE}
     */
    @Override
    public NodeFilter.FilterResult head(Node node, int i) {
        //skip document fragment
        if ("html".equals(node.nodeName())) {
            ignore = false;
        }
        if (ignore) {
            return FilterResult.CONTINUE;
        }
        if (node instanceof TextNode) {
            emitCharacters(((TextNode) node).getWholeText());
            return FilterResult.CONTINUE;
        }
        if (node instanceof DataNode) {
            //maybe handle script data directly here instead of
            //passing it through to the HTMLHandler?
            emitCharacters(((DataNode) node).getWholeData());
            return FilterResult.CONTINUE;
        }
        AttributesImpl attributes = new AttributesImpl();
        Iterator<Attribute> jsoupAttrs = node
                .attributes()
                .iterator();
        while (jsoupAttrs.hasNext()) {
            Attribute jsoupAttr = jsoupAttrs.next();
            attributes.addAttribute(XMLConstants.NULL_NS_URI, jsoupAttr.getKey(),
                    jsoupAttr.getKey(), "", jsoupAttr.getValue());
        }
        try {
            handler.startElement(XMLConstants.NULL_NS_URI, node.nodeName(), node.nodeName(),
                    attributes);
        } catch (SAXException e) {
            throw new RuntimeSAXException(e);
        }
        return FilterResult.CONTINUE;
    }

    /**
     * Called when a node is left: emits the {@code endElement} matching the
     * {@code startElement} sent from {@link #head(Node, int)}.
     *
     * @param node the node being left
     * @param i the depth reported by the traversal (unused)
     * @return always {@code FilterResult.CONTINUE}
     */
    @Override
    public NodeFilter.FilterResult tail(Node node, int i) {
        if (ignore) {
            return FilterResult.CONTINUE;
        }
        if (node instanceof TextNode || node instanceof DataNode) {
            return FilterResult.CONTINUE;
        }

        try {
            handler.endElement(XMLConstants.NULL_NS_URI, node.nodeName(), node.nodeName());
        } catch (SAXException e) {
            throw new RuntimeSAXException(e);
        }
        //Only start ignoring again once the html end tag has been reported, so that the
        //startElement emitted for html in head() gets its matching endElement.
        if ("html".equals(node.nodeName())) {
            ignore = true;
        }
        return FilterResult.CONTINUE;
    }

    /**
     * Forwards non-empty text to the handler as a single {@code characters} event;
     * {@code null} or empty text produces no event.
     */
    private void emitCharacters(String txt) {
        if (txt == null || txt.isEmpty()) {
            return;
        }
        char[] chars = txt.toCharArray();
        try {
            handler.characters(chars, 0, chars.length);
        } catch (SAXException e) {
            throw new RuntimeSAXException(e);
        }
    }
}

/**
 * Unchecked wrapper that carries a {@link SAXException} out of callbacks that cannot
 * declare checked exceptions. The wrapped exception is also registered as the cause so
 * that its message and stack trace are preserved if this exception propagates.
 */
private static class RuntimeSAXException extends RuntimeException {
    private final SAXException wrapped;

    private RuntimeSAXException(SAXException e) {
        //pass the cause up so getCause() and the "Caused by" stack trace are not lost
        super(e);
        this.wrapped = e;
    }

    /**
     * @return the SAXException that was passed to the constructor
     */
    SAXException getWrapped() {
        return wrapped;
    }
}

private static class XHTMLContentTagHandler extends ToTextContentHandler {
//Used to have a stack to make sure that starting/ending tags were matched
Expand Down
6 changes: 0 additions & 6 deletions tika-parent/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -435,7 +435,6 @@
<spring.version>5.3.39</spring.version>
<sqlite.version>3.47.0.0</sqlite.version>
<stax.ex.version>2.1.0</stax.ex.version>
<tagsoup.version>1.2.1</tagsoup.version>
<testcontainers.version>1.20.3</testcontainers.version>
<!-- NOTE: sync tukaani version with commons-compress in tika-parent -->
<tukaani.version>1.10</tukaani.version>
Expand Down Expand Up @@ -910,11 +909,6 @@
<artifactId>bcprov-jdk18on</artifactId>
<version>${bouncycastle.version}</version>
</dependency>
<dependency>
<groupId>org.ccil.cowan.tagsoup</groupId>
<artifactId>tagsoup</artifactId>
<version>${tagsoup.version}</version>
</dependency>
<dependency>
<groupId>org.freemarker</groupId>
<artifactId>freemarker</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@
<version>${jhighlight.version}</version>
</dependency>
<dependency>
<groupId>org.ccil.cowan.tagsoup</groupId>
<artifactId>tagsoup</artifactId>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
</dependency>
<dependency>
<groupId>org.ow2.asm</groupId>
Expand Down
Loading

0 comments on commit 2bb4624

Please sign in to comment.