Apache Tika 如何提取没有页眉和页脚内容的 html 正文

2023-11-30

我希望提取除页眉和页脚之外的 html 的整个正文内容,但是我遇到了异常

org.xml.sax.SAXException:命名空间http://www.w3.org/1999/xhtml未申报

下面是我如上所述创建的代码at

import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ToHTMLContentHandler;
import org.apache.tika.sax.ToXMLContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.sax.xpath.Matcher;
import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.metadata.Metadata;

import java.io.File;
//import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;


public class NewtikaXpath {
    public static void main(String args[]) throws IOException, SAXException, TikaException {
        XPathParser xhtmlParser = new XPathParser("xhtml", XHTMLContentHandler.XHTML);
        Matcher divContentMatcher = xhtmlParser.parse("/xhtml:html/xhtml:body/xhtml:table/descendant::node()");
        ContentHandler xhandler = new MatchingContentHandler(new ToXMLContentHandler(), divContentMatcher);
        AutoDetectParser parser = new AutoDetectParser();
        //ToHTMLContentHandler htmlhandler = new ToHTMLContentHandler();
        //ContentHandler textHandler = new BodyContentHandler();
        Metadata xmetadata = new Metadata();
        try  (InputStream stream = TikaInputStream.get(new File("///httpmoneycnncomnewsworldiidHBNQL1.html"))){
            parser.parse(stream, xhandler, xmetadata);
            System.out.println("text:\n" + xhandler.toString());
        }
    }

}

我得到的例外是

Exception in thread "main" org.xml.sax.SAXException: Namespace http://www.w3.org/1999/xhtml not declared
at org.apache.tika.sax.ToXMLContentHandler$ElementInfo.getPrefix(ToXMLContentHandler.java:62)
at org.apache.tika.sax.ToXMLContentHandler$ElementInfo.getQName(ToXMLContentHandler.java:68)
at org.apache.tika.sax.ToXMLContentHandler.startElement(ToXMLContentHandler.java:148)
at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at org.apache.tika.sax.xpath.MatchingContentHandler.startElement(MatchingContentHandler.java:60)
at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at org.apache.tika.sax.SecureContentHandler.startElement(SecureContentHandler.java:250)
at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at org.apache.tika.sax.SafeContentHandler.startElement(SafeContentHandler.java:264)
at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:255)
at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:285)
at org.apache.tika.parser.html.HtmlHandler.startElementWithSafeAttributes(HtmlHandler.java:171)
at org.apache.tika.parser.html.HtmlHandler.startElement(HtmlHandler.java:133)
at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at org.apache.tika.parser.html.XHTMLDowngradeHandler.startElement(XHTMLDowngradeHandler.java:60)
at org.ccil.cowan.tagsoup.Parser.push(Parser.java:794)
at org.ccil.cowan.tagsoup.Parser.rectify(Parser.java:1061)
at org.ccil.cowan.tagsoup.Parser.stagc(Parser.java:1016)
at org.ccil.cowan.tagsoup.HTMLScanner.scan(HTMLScanner.java:567)
at org.ccil.cowan.tagsoup.Parser.parse(Parser.java:449)
at org.apache.tika.parser.html.HtmlParser.parse(HtmlParser.java:122)
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)
at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:120)
at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:136)
at com.fractal.NewtikaXpath.main(NewtikaXpath.java:35)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at com.intellij.rt.execution.application.AppMain.main(AppMain.java:147)

虽然我确实明白,根据蒂卡-1215我们不应该包装内容处理程序,我没有看到任何替代方法来解决这个问题,因为简单的 bodycontenthandler 没有帮助,我验证了很多与此类似的 stackoverflow 案例,但我在任何地方都找不到解决方案。非常感谢任何建议、解决方案或指针。


找到了解决方案at基于研究锅炉管检测,它与 apache tika 集成,可以使用下面的 java 代码运行。

import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.html.BoilerpipeContentHandler;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.metadata.Metadata;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;   

public class NewtikaXpath {
    public static void main(String args[]) throws IOException, SAXException, TikaException {
        AutoDetectParser parser = new AutoDetectParser();
        ContentHandler textHandler = new BodyContentHandler();
        Metadata xmetadata = new Metadata();
        try  (InputStream stream = TikaInputStream.get(new URL("your favourite url"))){
            parser.parse(stream, new BoilerpipeContentHandler(textHandler), xmetadata);
            System.out.println("text:\n" + textHandler.toString());
        }
    }

}

您可以进行简单的锅炉管道检测演示at..还可以提供更多信息at..

本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系:hwhale#tublm.com(使用前将#替换为@)

Apache Tika 如何提取没有页眉和页脚内容的 html 正文 的相关文章