diff --git a/tika-bom/pom.xml b/tika-bom/pom.xml index a61b77bd32..bf5d998064 100644 --- a/tika-bom/pom.xml +++ b/tika-bom/pom.xml @@ -257,11 +257,6 @@ tika-parser-sqlite3-package 3.0.1-SNAPSHOT - - org.apache.tika - tika-parser-tagsoup-package - 3.0.0-SNAPSHOT - @@ -274,11 +269,6 @@ tika-parser-sqlite3-module 3.0.1-SNAPSHOT - - org.apache.tika - tika-parser-tagsoup-module - 3.0.0-SNAPSHOT - org.apache.tika diff --git a/tika-bundles/tika-bundle-standard/pom.xml b/tika-bundles/tika-bundle-standard/pom.xml index 1b0ea87f8b..851cac644c 100644 --- a/tika-bundles/tika-bundle-standard/pom.xml +++ b/tika-bundles/tika-bundle-standard/pom.xml @@ -174,7 +174,7 @@ jackcess| jackcess-encrypt| commons-lang3| - tagsoup| + jsoup| asm| juniversalchardet| vorbis-java-core| diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java index 9f96186aab..831611c063 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java @@ -69,8 +69,7 @@ public void endElement(String uri, String localName, String name) throws SAXExce if (matcher.matchesElement()) { super.endElement(uri, localName, name); } - // Sometimes tagsoup returns double end tags, so the stack might - // be empty! TODO: Remove this when the tagsoup problem is fixed. 
+ // Originally a workaround for tagsoup emitting double end tags; kept generally as a guard because the stack might be empty here if (!matchers.isEmpty()) { matcher = matchers.removeFirst(); } diff --git a/tika-eval/tika-eval-app/pom.xml b/tika-eval/tika-eval-app/pom.xml index 7b80203f84..3d75fea31f 100644 --- a/tika-eval/tika-eval-app/pom.xml +++ b/tika-eval/tika-eval-app/pom.xml @@ -101,7 +101,7 @@ org.apache.lucene:lucene-core:jar: org.apache.lucene:lucene-analysis-common:jar: org.apache.lucene:lucene-analysis-icu:jar: - org.ccil.cowan.tagsoup:tagsoup:jar: + org.jsoup:jsoup:jar: com.ibm.icu:icu4j:jar: com.fasterxml.jackson.core:jackson-core:jar: com.fasterxml.jackson.core:jackson-databind:jar: diff --git a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/SimpleComparerTest.java b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/SimpleComparerTest.java index 035869c9a9..6b900bab31 100644 --- a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/SimpleComparerTest.java +++ b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/SimpleComparerTest.java @@ -279,7 +279,7 @@ public void testBadTags() throws Exception { List> tableInfosB = WRITER.getTable(ExtractComparer.TAGS_TABLE_B); assertEquals(1, tableInfosB.size()); Map tableInfoB = tableInfosB.get(0); - //there actually is a tag problem, but tagsoup fixes it. + //there actually is a tag problem, but jsoup fixes it. //this confirms behavior. 
assertEquals("false", tableInfoB.get(Cols.TAGS_PARSE_EXCEPTION)); } diff --git a/tika-eval/tika-eval-core/pom.xml b/tika-eval/tika-eval-core/pom.xml index 252eedc2e8..60af7af1f7 100644 --- a/tika-eval/tika-eval-core/pom.xml +++ b/tika-eval/tika-eval-core/pom.xml @@ -75,8 +75,8 @@ commons-lang3 - org.ccil.cowan.tagsoup - tagsoup + org.jsoup + jsoup diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/ContentTagParser.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/ContentTagParser.java index 1e3511d633..c3eecc252c 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/ContentTagParser.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/ContentTagParser.java @@ -20,14 +20,24 @@ import java.io.IOException; import java.io.StringReader; import java.util.HashMap; +import java.util.Iterator; import java.util.Locale; import java.util.Map; import java.util.Set; +import javax.xml.XMLConstants; -import org.ccil.cowan.tagsoup.jaxp.SAXParserImpl; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Attribute; +import org.jsoup.nodes.DataNode; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; +import org.jsoup.select.NodeFilter; +import org.jsoup.select.NodeTraversor; import org.xml.sax.Attributes; -import org.xml.sax.InputSource; +import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; import org.apache.tika.exception.TikaException; import org.apache.tika.parser.ParseContext; @@ -53,11 +63,106 @@ public static ContentTags parseHTML(String html, Set uppercaseTagsOfInte Map tags = new HashMap<>(); XHTMLContentTagHandler xhtmlContentTagHandler = new XHTMLContentTagHandler(uppercaseTagsOfInterest, tags); - SAXParserImpl.newInstance(null) - .parse(new InputSource(new StringReader(html)), xhtmlContentTagHandler); + Document document = Jsoup.parse(html); + 
NodeTraversor.filter(new TikaNodeFilter(xhtmlContentTagHandler), document); + return new ContentTags(xhtmlContentTagHandler.toString(), tags); } + private static class TikaNodeFilter implements NodeFilter { + boolean ignore = true; + ContentHandler handler; + + private TikaNodeFilter(ContentHandler handler) { + this.handler = handler; + } + + @Override + public NodeFilter.FilterResult head(Node node, int i) { + //skip document fragment + if ("html".equals(node.nodeName())) { + ignore = false; + } + if (ignore) { + return FilterResult.CONTINUE; + } + if (node instanceof TextNode) { + String txt = ((TextNode) node).getWholeText(); + if (txt != null) { + char[] chars = txt.toCharArray(); + try { + if (chars.length > 0) { + handler.characters(chars, 0, chars.length); + } + } catch (SAXException e) { + throw new RuntimeSAXException(e); + } + } + return NodeFilter.FilterResult.CONTINUE; + } else if (node instanceof DataNode) { + //maybe handle script data directly here instead of + //passing it through to the HTMLHandler? 
+ String txt = ((DataNode) node).getWholeData(); + if (txt != null) { + char[] chars = txt.toCharArray(); + try { + if (chars.length > 0) { + handler.characters(chars, 0, chars.length); + } + } catch (SAXException e) { + throw new RuntimeSAXException(e); + } + } + return NodeFilter.FilterResult.CONTINUE; + } + AttributesImpl attributes = new AttributesImpl(); + Iterator jsoupAttrs = node + .attributes() + .iterator(); + while (jsoupAttrs.hasNext()) { + Attribute jsoupAttr = jsoupAttrs.next(); + attributes.addAttribute("", jsoupAttr.getKey(), jsoupAttr.getKey(), "", jsoupAttr.getValue()); + } + try { + handler.startElement("", node.nodeName(), node.nodeName(), attributes); + } catch (SAXException e) { + throw new RuntimeSAXException(e); + } + return NodeFilter.FilterResult.CONTINUE; + } + + @Override + public NodeFilter.FilterResult tail(Node node, int i) { + if ("html".equals(node.nodeName())) { + ignore = true; + } + if (ignore) { + return FilterResult.CONTINUE; + } + if (node instanceof TextNode || node instanceof DataNode) { + return NodeFilter.FilterResult.CONTINUE; + } + + try { + handler.endElement(XMLConstants.NULL_NS_URI, node.nodeName(), node.nodeName()); + } catch (SAXException e) { + throw new RuntimeSAXException(e); + } + return NodeFilter.FilterResult.CONTINUE; + } + } + + private static class RuntimeSAXException extends RuntimeException { + private SAXException wrapped; + + private RuntimeSAXException(SAXException e) { + this.wrapped = e; + } + + SAXException getWrapped() { + return wrapped; + } + } private static class XHTMLContentTagHandler extends ToTextContentHandler { //Used to have a stack to make sure that starting/ending tags were matched diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index 6b589591b0..529654e04e 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -435,7 +435,6 @@ 5.3.39 3.47.0.0 2.1.0 - 1.2.1 1.20.3 1.10 @@ -910,11 +909,6 @@ bcprov-jdk18on ${bouncycastle.version} - - org.ccil.cowan.tagsoup - tagsoup - 
${tagsoup.version} - org.freemarker freemarker diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/pom.xml index 7fefaa7c53..bdd44b4438 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/pom.xml +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/pom.xml @@ -41,8 +41,8 @@ ${jhighlight.version} - org.ccil.cowan.tagsoup - tagsoup + org.jsoup + jsoup org.ow2.asm diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java index a3d2a4b48e..c11f20d368 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java @@ -22,22 +22,29 @@ import java.io.IOException; import java.io.InputStream; -import java.io.StringReader; import java.nio.charset.Charset; import java.util.HashMap; +import java.util.Iterator; import java.util.Map; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; +import javax.xml.XMLConstants; import org.apache.commons.io.input.CloseShieldInputStream; -import org.ccil.cowan.tagsoup.HTMLSchema; -import org.ccil.cowan.tagsoup.Schema; import org.codelibs.jhighlight.renderer.Renderer; import org.codelibs.jhighlight.renderer.XhtmlRendererFactory; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Attribute; +import org.jsoup.nodes.DataNode; +import org.jsoup.nodes.Document; 
+import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; +import org.jsoup.select.NodeFilter; +import org.jsoup.select.NodeTraversor; import org.xml.sax.ContentHandler; -import org.xml.sax.InputSource; import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; import org.apache.tika.detect.AutoDetectReader; import org.apache.tika.detect.EncodingDetector; @@ -47,6 +54,7 @@ import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractEncodingDetectorParser; import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; /** * Generic Source code parser for Java, Groovy, C++. @@ -61,19 +69,15 @@ public class SourceCodeParser extends AbstractEncodingDetectorParser { private static final Pattern AUTHORPATTERN = Pattern.compile("(?im)@author (.*) *$"); - private static final Map TYPES_TO_RENDERER = - new HashMap() { - private static final long serialVersionUID = -741976157563751152L; + private static final Map TYPES_TO_RENDERER = new HashMap() { + private static final long serialVersionUID = -741976157563751152L; - { - put(MediaType.text("x-c++src"), CPP); - put(MediaType.text("x-java-source"), JAVA); - put(MediaType.text("x-groovy"), GROOVY); - } - }; - - //Parse the HTML document - private static final Schema HTML_SCHEMA = new HTMLSchema(); + { + put(MediaType.text("x-c++src"), CPP); + put(MediaType.text("x-java-source"), JAVA); + put(MediaType.text("x-groovy"), GROOVY); + } + }; public SourceCodeParser() { super(); @@ -89,50 +93,57 @@ public Set getSupportedTypes(ParseContext context) { } @Override - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { try (AutoDetectReader reader = new AutoDetectReader(CloseShieldInputStream.wrap(stream), 
metadata, getEncodingDetector(context))) { Charset charset = reader.getCharset(); String mediaType = metadata.get(Metadata.CONTENT_TYPE); String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); - if (mediaType != null && name != null) { - MediaType type = MediaType.parse(mediaType); + MediaType type = null; + if (mediaType != null) { + type = MediaType.parse(mediaType); metadata.set(Metadata.CONTENT_TYPE, type.toString()); metadata.set(Metadata.CONTENT_ENCODING, charset.name()); - - StringBuilder out = new StringBuilder(); - String line; - int nbLines = 0; - while ((line = reader.readLine()) != null) { - out.append(line).append(System.getProperty("line.separator")); - String author = parserAuthor(line); - if (author != null) { - metadata.add(TikaCoreProperties.CREATOR, author); - } - nbLines++; + } else { + throw new TikaException("media type must be set in metadata before parse"); + } + StringBuilder out = new StringBuilder(); + String line; + int nbLines = 0; + while ((line = reader.readLine()) != null) { + out + .append(line) + .append(System.getProperty("line.separator")); + String author = parserAuthor(line); + if (author != null) { + metadata.add(TikaCoreProperties.CREATOR, author); } - metadata.set("LoC", String.valueOf(nbLines)); - Renderer renderer = getRenderer(type.toString()); - - String codeAsHtml = renderer.highlight(name, out.toString(), charset.name(), false); - - Schema schema = context.get(Schema.class, HTML_SCHEMA); + nbLines++; + } + metadata.set("LoC", String.valueOf(nbLines)); + Renderer renderer = getRenderer(type.toString()); - org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser(); - parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema); - parser.setContentHandler(handler); - parser.parse(new InputSource(new StringReader(codeAsHtml))); + String codeAsHtml = renderer.highlight(name, out.toString(), charset.name(), false); + Document document = Jsoup.parse(codeAsHtml); + 
document.quirksMode(Document.QuirksMode.quirks); + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + try { + NodeTraversor.filter(new TikaNodeFilter(xhtml), document); + } catch (RuntimeSAXException e) { + throw e.getWrapped(); + } finally { + xhtml.endDocument(); } } - } - private Renderer getRenderer(String mimeType) { + private Renderer getRenderer(String mimeType) throws TikaException { MediaType mt = MediaType.parse(mimeType); String type = TYPES_TO_RENDERER.get(mt); if (type == null) { - throw new RuntimeException("unparseable content type " + mimeType); + throw new TikaException("unparseable content type " + mimeType); } return XhtmlRendererFactory.getRenderer(type); } @@ -141,9 +152,106 @@ private Renderer getRenderer(String mimeType) { private String parserAuthor(String line) { Matcher m = AUTHORPATTERN.matcher(line); if (m.find()) { - return m.group(1).trim(); + return m + .group(1) + .trim(); } return null; } + + private static class TikaNodeFilter implements NodeFilter { + boolean ignore = true; + ContentHandler handler; + + private TikaNodeFilter(ContentHandler handler) { + this.handler = handler; + } + + @Override + public NodeFilter.FilterResult head(Node node, int i) { + //skip document fragment + if ("html".equals(node.nodeName())) { + ignore = false; + } + if (ignore) { + return FilterResult.CONTINUE; + } + if (node instanceof TextNode) { + String txt = ((TextNode) node).getWholeText(); + if (txt != null) { + char[] chars = txt.toCharArray(); + try { + if (chars.length > 0) { + handler.characters(chars, 0, chars.length); + } + } catch (SAXException e) { + throw new RuntimeSAXException(e); + } + } + return NodeFilter.FilterResult.CONTINUE; + } else if (node instanceof DataNode) { + //maybe handle script data directly here instead of + //passing it through to the HTMLHandler? 
+ String txt = ((DataNode) node).getWholeData(); + if (txt != null) { + char[] chars = txt.toCharArray(); + try { + if (chars.length > 0) { + handler.characters(chars, 0, chars.length); + } + } catch (SAXException e) { + throw new RuntimeSAXException(e); + } + } + return NodeFilter.FilterResult.CONTINUE; + } + AttributesImpl attributes = new AttributesImpl(); + Iterator jsoupAttrs = node + .attributes() + .iterator(); + while (jsoupAttrs.hasNext()) { + Attribute jsoupAttr = jsoupAttrs.next(); + attributes.addAttribute("", jsoupAttr.getKey(), jsoupAttr.getKey(), "", jsoupAttr.getValue()); + } + try { + handler.startElement("", node.nodeName(), node.nodeName(), attributes); + } catch (SAXException e) { + throw new RuntimeSAXException(e); + } + return NodeFilter.FilterResult.CONTINUE; + } + + @Override + public NodeFilter.FilterResult tail(Node node, int i) { + if ("html".equals(node.nodeName())) { + ignore = true; + } + if (ignore) { + return FilterResult.CONTINUE; + } + if (node instanceof TextNode || node instanceof DataNode) { + return NodeFilter.FilterResult.CONTINUE; + } + + try { + handler.endElement(XMLConstants.NULL_NS_URI, node.nodeName(), node.nodeName()); + } catch (SAXException e) { + throw new RuntimeSAXException(e); + } + return NodeFilter.FilterResult.CONTINUE; + } + } + + private static class RuntimeSAXException extends RuntimeException { + private SAXException wrapped; + + private RuntimeSAXException(SAXException e) { + this.wrapped = e; + } + + SAXException getWrapped() { + return wrapped; + } + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java index 45758f89bf..e932c066df 100644 --- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java @@ -56,12 +56,12 @@ public void testHTMLRenderWithReturnLine() throws Exception { getXML(getResourceAsStream("/test-documents/testJAVA.java"), sourceCodeParser, createMetadata("text/x-java-source")).xml; - assertTrue(htmlContent.indexOf("public") > + "public") > 0); - assertTrue(htmlContent.indexOf("static") > 0); - assertTrue(htmlContent.indexOf("") > 0); + assertTrue(htmlContent.indexOf("static") > 0); } @Test diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmParser.java index c60d133b7b..0255a91618 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmParser.java @@ -93,16 +93,13 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, private void parsePage(byte[] byteObject, Parser htmlParser, ContentHandler xhtml, - ParseContext context) throws TikaException, SAXException { // throws IOException + ParseContext context) throws TikaException, IOException, SAXException { // throws IOException InputStream stream = null; Metadata metadata = new Metadata(); ContentHandler handler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));// -1 - try { - stream = 
UnsynchronizedByteArrayInputStream.builder().setByteArray(byteObject).get(); - htmlParser.parse(stream, handler, metadata, context); - } catch (IOException e) { - // Pushback overflow from tagsoup - } + stream = UnsynchronizedByteArrayInputStream.builder().setByteArray(byteObject).get(); + htmlParser.parse(stream, handler, metadata, context); + } } diff --git a/tika-server/tika-server-eval/pom.xml b/tika-server/tika-server-eval/pom.xml index 4e7275f3e3..52f8f594e0 100644 --- a/tika-server/tika-server-eval/pom.xml +++ b/tika-server/tika-server-eval/pom.xml @@ -69,7 +69,7 @@ org.apache.lucene:lucene-core:jar: org.apache.lucene:lucene-analysis-common:jar: org.apache.lucene:lucene-analysis-icu:jar: - org.ccil.cowan.tagsoup:tagsoup:jar: + org.jsoup:jsoup:jar: com.ibm.icu:icu4j:jar: com.fasterxml.jackson.core:jackson-core:jar: com.fasterxml.jackson.core:jackson-databind:jar: