diff --git a/tika-bom/pom.xml b/tika-bom/pom.xml
index a61b77bd32..bf5d998064 100644
--- a/tika-bom/pom.xml
+++ b/tika-bom/pom.xml
@@ -257,11 +257,6 @@
tika-parser-sqlite3-package
3.0.1-SNAPSHOT
-
- org.apache.tika
- tika-parser-tagsoup-package
- 3.0.0-SNAPSHOT
-
@@ -274,11 +269,6 @@
tika-parser-sqlite3-module
3.0.1-SNAPSHOT
-
- org.apache.tika
- tika-parser-tagsoup-module
- 3.0.0-SNAPSHOT
-
org.apache.tika
diff --git a/tika-bundles/tika-bundle-standard/pom.xml b/tika-bundles/tika-bundle-standard/pom.xml
index 1b0ea87f8b..851cac644c 100644
--- a/tika-bundles/tika-bundle-standard/pom.xml
+++ b/tika-bundles/tika-bundle-standard/pom.xml
@@ -174,7 +174,7 @@
jackcess|
jackcess-encrypt|
commons-lang3|
- tagsoup|
+ jsoup|
asm|
juniversalchardet|
vorbis-java-core|
diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java
index 9f96186aab..831611c063 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java
@@ -69,8 +69,7 @@ public void endElement(String uri, String localName, String name) throws SAXExce
if (matcher.matchesElement()) {
super.endElement(uri, localName, name);
}
- // Sometimes tagsoup returns double end tags, so the stack might
- // be empty! TODO: Remove this when the tagsoup problem is fixed.
+ // Guard against an empty stack: originally added for tagsoup's double end tags, but needed generally
if (!matchers.isEmpty()) {
matcher = matchers.removeFirst();
}
diff --git a/tika-eval/tika-eval-app/pom.xml b/tika-eval/tika-eval-app/pom.xml
index 7b80203f84..3d75fea31f 100644
--- a/tika-eval/tika-eval-app/pom.xml
+++ b/tika-eval/tika-eval-app/pom.xml
@@ -101,7 +101,7 @@
org.apache.lucene:lucene-core:jar:
org.apache.lucene:lucene-analysis-common:jar:
org.apache.lucene:lucene-analysis-icu:jar:
- org.ccil.cowan.tagsoup:tagsoup:jar:
+ org.jsoup:jsoup:jar:
com.ibm.icu:icu4j:jar:
com.fasterxml.jackson.core:jackson-core:jar:
com.fasterxml.jackson.core:jackson-databind:jar:
diff --git a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/SimpleComparerTest.java b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/SimpleComparerTest.java
index 035869c9a9..6b900bab31 100644
--- a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/SimpleComparerTest.java
+++ b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/SimpleComparerTest.java
@@ -279,7 +279,7 @@ public void testBadTags() throws Exception {
List
- org.ccil.cowan.tagsoup
- tagsoup
+ org.jsoup
+ jsoup
diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/ContentTagParser.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/ContentTagParser.java
index 1e3511d633..c3eecc252c 100644
--- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/ContentTagParser.java
+++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/ContentTagParser.java
@@ -20,14 +20,24 @@
import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
+import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
+import javax.xml.XMLConstants;
-import org.ccil.cowan.tagsoup.jaxp.SAXParserImpl;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Attribute;
+import org.jsoup.nodes.DataNode;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Node;
+import org.jsoup.nodes.TextNode;
+import org.jsoup.select.NodeFilter;
+import org.jsoup.select.NodeTraversor;
import org.xml.sax.Attributes;
-import org.xml.sax.InputSource;
+import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.ParseContext;
@@ -53,11 +63,106 @@ public static ContentTags parseHTML(String html, Set uppercaseTagsOfInte
Map tags = new HashMap<>();
XHTMLContentTagHandler xhtmlContentTagHandler =
new XHTMLContentTagHandler(uppercaseTagsOfInterest, tags);
- SAXParserImpl.newInstance(null)
- .parse(new InputSource(new StringReader(html)), xhtmlContentTagHandler);
+ Document document = Jsoup.parse(html);
+ NodeTraversor.filter(new TikaNodeFilter(xhtmlContentTagHandler), document);
+
return new ContentTags(xhtmlContentTagHandler.toString(), tags);
}
+    /**
+     * Replays a jsoup node tree to a SAX {@link ContentHandler}: text and data nodes become
+     * {@code characters} events, every other node a {@code startElement}/{@code endElement} call.
+     * Nothing is emitted until the {@code html} node is entered. SAX failures are rethrown as
+     * {@link RuntimeSAXException} because the {@link NodeFilter} callbacks declare no checked exceptions.
+     */
+    private static class TikaNodeFilter implements NodeFilter {
+        //true while outside the html element: before its head and from its tail onwards
+        private boolean ignore = true;
+        private final ContentHandler handler;
+
+        private TikaNodeFilter(ContentHandler handler) {
+            this.handler = handler;
+        }
+
+        @Override
+        public NodeFilter.FilterResult head(Node node, int i) {
+            //skip everything that precedes the html element (the document fragment)
+            if ("html".equals(node.nodeName())) {
+                ignore = false;
+            }
+            if (ignore) {
+                return NodeFilter.FilterResult.CONTINUE;
+            }
+            if (node instanceof TextNode) {
+                characters(((TextNode) node).getWholeText());
+                return NodeFilter.FilterResult.CONTINUE;
+            }
+            if (node instanceof DataNode) {
+                //data nodes (e.g. script content) are passed on as character data; maybe handle directly?
+                characters(((DataNode) node).getWholeData());
+                return NodeFilter.FilterResult.CONTINUE;
+            }
+            AttributesImpl attributes = new AttributesImpl();
+            Iterator<Attribute> jsoupAttrs = node.attributes().iterator();
+            while (jsoupAttrs.hasNext()) {
+                Attribute jsoupAttr = jsoupAttrs.next();
+                attributes.addAttribute("", jsoupAttr.getKey(), jsoupAttr.getKey(), "", jsoupAttr.getValue());
+            }
+            try {
+                handler.startElement(XMLConstants.NULL_NS_URI, node.nodeName(), node.nodeName(), attributes);
+            } catch (SAXException e) {
+                throw new RuntimeSAXException(e);
+            }
+            return NodeFilter.FilterResult.CONTINUE;
+        }
+
+        @Override
+        public NodeFilter.FilterResult tail(Node node, int i) {
+            //NOTE(review): ignore is set before the end event, so html gets a start but no end -- confirm
+            if ("html".equals(node.nodeName())) {
+                ignore = true;
+            }
+            if (ignore) {
+                return NodeFilter.FilterResult.CONTINUE;
+            }
+            if (node instanceof TextNode || node instanceof DataNode) {
+                //text and data nodes were reported in head() and have no end event
+                return NodeFilter.FilterResult.CONTINUE;
+            }
+            try {
+                handler.endElement(XMLConstants.NULL_NS_URI, node.nodeName(), node.nodeName());
+            } catch (SAXException e) {
+                throw new RuntimeSAXException(e);
+            }
+            return NodeFilter.FilterResult.CONTINUE;
+        }
+
+        /**
+         * Sends {@code txt} to the handler as one characters event; null or empty text emits nothing.
+         */
+        private void characters(String txt) {
+            if (txt == null || txt.isEmpty()) {
+                return;
+            }
+            try {
+                handler.characters(txt.toCharArray(), 0, txt.length());
+            } catch (SAXException e) {
+                throw new RuntimeSAXException(e);
+            }
+        }
+    }
+
+    /** Unchecked wrapper that carries a {@link SAXException} out of the {@link NodeFilter} callbacks. */
+    private static class RuntimeSAXException extends RuntimeException {
+        private final SAXException wrapped;
+        private RuntimeSAXException(SAXException e) {
+            super(e); //chain the cause so an instance that escapes uncaught is still diagnosable
+            this.wrapped = e;
+        }
+        SAXException getWrapped() {
+            return wrapped;
+        }
+    }
private static class XHTMLContentTagHandler extends ToTextContentHandler {
//Used to have a stack to make sure that starting/ending tags were matched
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 6b589591b0..529654e04e 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -435,7 +435,6 @@
5.3.39
3.47.0.0
2.1.0
- 1.2.1
1.20.3
1.10
@@ -910,11 +909,6 @@
bcprov-jdk18on
${bouncycastle.version}
-
- org.ccil.cowan.tagsoup
- tagsoup
- ${tagsoup.version}
-
org.freemarker
freemarker
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/pom.xml
index 7fefaa7c53..bdd44b4438 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/pom.xml
@@ -41,8 +41,8 @@
${jhighlight.version}
- org.ccil.cowan.tagsoup
- tagsoup
+ org.jsoup
+ jsoup
org.ow2.asm
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
index a3d2a4b48e..c11f20d368 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
@@ -22,22 +22,29 @@
import java.io.IOException;
import java.io.InputStream;
-import java.io.StringReader;
import java.nio.charset.Charset;
import java.util.HashMap;
+import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import javax.xml.XMLConstants;
import org.apache.commons.io.input.CloseShieldInputStream;
-import org.ccil.cowan.tagsoup.HTMLSchema;
-import org.ccil.cowan.tagsoup.Schema;
import org.codelibs.jhighlight.renderer.Renderer;
import org.codelibs.jhighlight.renderer.XhtmlRendererFactory;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Attribute;
+import org.jsoup.nodes.DataNode;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Node;
+import org.jsoup.nodes.TextNode;
+import org.jsoup.select.NodeFilter;
+import org.jsoup.select.NodeTraversor;
import org.xml.sax.ContentHandler;
-import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.detect.AutoDetectReader;
import org.apache.tika.detect.EncodingDetector;
@@ -47,6 +54,7 @@
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractEncodingDetectorParser;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
/**
* Generic Source code parser for Java, Groovy, C++.
@@ -61,19 +69,15 @@ public class SourceCodeParser extends AbstractEncodingDetectorParser {
private static final Pattern AUTHORPATTERN = Pattern.compile("(?im)@author (.*) *$");
- private static final Map TYPES_TO_RENDERER =
- new HashMap() {
- private static final long serialVersionUID = -741976157563751152L;
+ private static final Map TYPES_TO_RENDERER = new HashMap() {
+ private static final long serialVersionUID = -741976157563751152L;
- {
- put(MediaType.text("x-c++src"), CPP);
- put(MediaType.text("x-java-source"), JAVA);
- put(MediaType.text("x-groovy"), GROOVY);
- }
- };
-
- //Parse the HTML document
- private static final Schema HTML_SCHEMA = new HTMLSchema();
+ {
+ put(MediaType.text("x-c++src"), CPP);
+ put(MediaType.text("x-java-source"), JAVA);
+ put(MediaType.text("x-groovy"), GROOVY);
+ }
+ };
public SourceCodeParser() {
super();
@@ -89,50 +93,57 @@ public Set getSupportedTypes(ParseContext context) {
}
@Override
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
- ParseContext context) throws IOException, SAXException, TikaException {
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
try (AutoDetectReader reader = new AutoDetectReader(CloseShieldInputStream.wrap(stream),
metadata, getEncodingDetector(context))) {
Charset charset = reader.getCharset();
String mediaType = metadata.get(Metadata.CONTENT_TYPE);
String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
- if (mediaType != null && name != null) {
- MediaType type = MediaType.parse(mediaType);
+ MediaType type = null;
+ if (mediaType != null) {
+ type = MediaType.parse(mediaType);
metadata.set(Metadata.CONTENT_TYPE, type.toString());
metadata.set(Metadata.CONTENT_ENCODING, charset.name());
-
- StringBuilder out = new StringBuilder();
- String line;
- int nbLines = 0;
- while ((line = reader.readLine()) != null) {
- out.append(line).append(System.getProperty("line.separator"));
- String author = parserAuthor(line);
- if (author != null) {
- metadata.add(TikaCoreProperties.CREATOR, author);
- }
- nbLines++;
+ } else {
+ throw new TikaException("media type must be set in metadata before parse");
+ }
+ StringBuilder out = new StringBuilder();
+ String line;
+ int nbLines = 0;
+ while ((line = reader.readLine()) != null) {
+ out
+ .append(line)
+ .append(System.getProperty("line.separator"));
+ String author = parserAuthor(line);
+ if (author != null) {
+ metadata.add(TikaCoreProperties.CREATOR, author);
}
- metadata.set("LoC", String.valueOf(nbLines));
- Renderer renderer = getRenderer(type.toString());
-
- String codeAsHtml = renderer.highlight(name, out.toString(), charset.name(), false);
-
- Schema schema = context.get(Schema.class, HTML_SCHEMA);
+ nbLines++;
+ }
+ metadata.set("LoC", String.valueOf(nbLines));
+ Renderer renderer = getRenderer(type.toString());
- org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser();
- parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
- parser.setContentHandler(handler);
- parser.parse(new InputSource(new StringReader(codeAsHtml)));
+ String codeAsHtml = renderer.highlight(name, out.toString(), charset.name(), false);
+ Document document = Jsoup.parse(codeAsHtml);
+ document.quirksMode(Document.QuirksMode.quirks);
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ try {
+ NodeTraversor.filter(new TikaNodeFilter(xhtml), document);
+ } catch (RuntimeSAXException e) {
+ throw e.getWrapped();
+ } finally {
+ xhtml.endDocument();
}
}
-
}
- private Renderer getRenderer(String mimeType) {
+ private Renderer getRenderer(String mimeType) throws TikaException {
MediaType mt = MediaType.parse(mimeType);
String type = TYPES_TO_RENDERER.get(mt);
if (type == null) {
- throw new RuntimeException("unparseable content type " + mimeType);
+ throw new TikaException("unparseable content type " + mimeType);
}
return XhtmlRendererFactory.getRenderer(type);
}
@@ -141,9 +152,106 @@ private Renderer getRenderer(String mimeType) {
private String parserAuthor(String line) {
Matcher m = AUTHORPATTERN.matcher(line);
if (m.find()) {
- return m.group(1).trim();
+ return m
+ .group(1)
+ .trim();
}
return null;
}
+
+    /**
+     * Replays a jsoup node tree to a SAX {@link ContentHandler}: text and data nodes become
+     * {@code characters} events, every other node a {@code startElement}/{@code endElement} call.
+     * Nothing is emitted until the {@code html} node is entered. SAX failures are rethrown as
+     * {@link RuntimeSAXException} because the {@link NodeFilter} callbacks declare no checked exceptions.
+     */
+    private static class TikaNodeFilter implements NodeFilter {
+        //true while outside the html element: before its head and from its tail onwards
+        private boolean ignore = true;
+        private final ContentHandler handler;
+
+        private TikaNodeFilter(ContentHandler handler) {
+            this.handler = handler;
+        }
+
+        @Override
+        public NodeFilter.FilterResult head(Node node, int i) {
+            //skip everything that precedes the html element (the document fragment)
+            if ("html".equals(node.nodeName())) {
+                ignore = false;
+            }
+            if (ignore) {
+                return NodeFilter.FilterResult.CONTINUE;
+            }
+            if (node instanceof TextNode) {
+                characters(((TextNode) node).getWholeText());
+                return NodeFilter.FilterResult.CONTINUE;
+            }
+            if (node instanceof DataNode) {
+                //data nodes (e.g. script content) are passed on as character data; maybe handle directly?
+                characters(((DataNode) node).getWholeData());
+                return NodeFilter.FilterResult.CONTINUE;
+            }
+            AttributesImpl attributes = new AttributesImpl();
+            Iterator<Attribute> jsoupAttrs = node.attributes().iterator();
+            while (jsoupAttrs.hasNext()) {
+                Attribute jsoupAttr = jsoupAttrs.next();
+                attributes.addAttribute("", jsoupAttr.getKey(), jsoupAttr.getKey(), "", jsoupAttr.getValue());
+            }
+            try {
+                handler.startElement(XMLConstants.NULL_NS_URI, node.nodeName(), node.nodeName(), attributes);
+            } catch (SAXException e) {
+                throw new RuntimeSAXException(e);
+            }
+            return NodeFilter.FilterResult.CONTINUE;
+        }
+
+        @Override
+        public NodeFilter.FilterResult tail(Node node, int i) {
+            //NOTE(review): ignore is set before the end event, so html gets a start but no end -- confirm
+            if ("html".equals(node.nodeName())) {
+                ignore = true;
+            }
+            if (ignore) {
+                return NodeFilter.FilterResult.CONTINUE;
+            }
+            if (node instanceof TextNode || node instanceof DataNode) {
+                //text and data nodes were reported in head() and have no end event
+                return NodeFilter.FilterResult.CONTINUE;
+            }
+            try {
+                handler.endElement(XMLConstants.NULL_NS_URI, node.nodeName(), node.nodeName());
+            } catch (SAXException e) {
+                throw new RuntimeSAXException(e);
+            }
+            return NodeFilter.FilterResult.CONTINUE;
+        }
+
+        /**
+         * Sends {@code txt} to the handler as one characters event; null or empty text emits nothing.
+         */
+        private void characters(String txt) {
+            if (txt == null || txt.isEmpty()) {
+                return;
+            }
+            try {
+                handler.characters(txt.toCharArray(), 0, txt.length());
+            } catch (SAXException e) {
+                throw new RuntimeSAXException(e);
+            }
+        }
+    }
+
+    /** Unchecked wrapper that carries a {@link SAXException} out of the {@link NodeFilter} callbacks. */
+    private static class RuntimeSAXException extends RuntimeException {
+        private final SAXException wrapped;
+        private RuntimeSAXException(SAXException e) {
+            super(e); //chain the cause so an instance that escapes uncaught is still diagnosable
+            this.wrapped = e;
+        }
+        SAXException getWrapped() {
+            return wrapped;
+        }
+    }
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
index 45758f89bf..e932c066df 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
@@ -56,12 +56,12 @@ public void testHTMLRenderWithReturnLine() throws Exception {
getXML(getResourceAsStream("/test-documents/testJAVA.java"), sourceCodeParser,
createMetadata("text/x-java-source")).xml;
- assertTrue(htmlContent.indexOf("public") >
+ "public") >
0);
- assertTrue(htmlContent.indexOf("static") > 0);
- assertTrue(htmlContent.indexOf("") > 0);
+ assertTrue(htmlContent.indexOf("static") > 0);
}
@Test
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmParser.java
index c60d133b7b..0255a91618 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmParser.java
@@ -93,16 +93,13 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
private void parsePage(byte[] byteObject, Parser htmlParser, ContentHandler xhtml,
- ParseContext context) throws TikaException, SAXException { // throws IOException
+ ParseContext context) throws TikaException, IOException, SAXException {
InputStream stream = null;
Metadata metadata = new Metadata();
ContentHandler handler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));// -1
- try {
- stream = UnsynchronizedByteArrayInputStream.builder().setByteArray(byteObject).get();
- htmlParser.parse(stream, handler, metadata, context);
- } catch (IOException e) {
- // Pushback overflow from tagsoup
- }
+ stream = UnsynchronizedByteArrayInputStream.builder().setByteArray(byteObject).get();
+ htmlParser.parse(stream, handler, metadata, context);
+
}
}
diff --git a/tika-server/tika-server-eval/pom.xml b/tika-server/tika-server-eval/pom.xml
index 4e7275f3e3..52f8f594e0 100644
--- a/tika-server/tika-server-eval/pom.xml
+++ b/tika-server/tika-server-eval/pom.xml
@@ -69,7 +69,7 @@
org.apache.lucene:lucene-core:jar:
org.apache.lucene:lucene-analysis-common:jar:
org.apache.lucene:lucene-analysis-icu:jar:
- org.ccil.cowan.tagsoup:tagsoup:jar:
+ org.jsoup:jsoup:jar:
com.ibm.icu:icu4j:jar:
com.fasterxml.jackson.core:jackson-core:jar:
com.fasterxml.jackson.core:jackson-databind:jar: