



1. HtmlParser 简介


2. 建立Maven工程



<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"


	 * 获取html中的主题和所有回复节点
	 * @param url
	 * @param ENCODE
	 * @return
	protected  NodeList getNodelist(String url, String ENCODE) {

		try {
			NodeList nodeList = null;
			Parser parser = new Parser(url);
			NodeFilter filter = new NodeFilter() {
				public boolean accept(Node node) {
					if(node.getText().contains("style=\"border-bottom: 0px;\"")) {
						return true;
					} else {
						return false;
			NodeFilter replyfilter = new NodeFilter() {
				public boolean accept(Node node) {
					String containsString = "id=\"r_";
					if(node.getText().contains(containsString)) {
						return true;
					} else {
						return false;
			OrFilter allFilter = new OrFilter(filter, replyfilter);
			nodeList = parser.extractAllNodesThatMatch(allFilter);
			return nodeList;
		} catch (ParserException e) {
			return null;





	public Forum parse2Thread(String url,String ENCODE) {
		List<Reply> replylist = new ArrayList<Reply>();	//回复列表
		Topic topic = new Topic();	//主题
		NodeFilter divFilter = new NodeClassFilter(Div.class);//div过滤器
		NodeFilter headingFilter = new NodeClassFilter(HeadingTag.class);//heading过滤器
		NodeFilter tagFilter = new NodeClassFilter(TagNode.class);//heading过滤器
		NodeList nodeList = this.getNodelist(url, ENCODE);

		for (int i = 0; i < nodeList.size(); i++) {
			Node node = nodeList.elementAt(i);
			if(node.getText().contains("style=\"border-bottom: 0px;\"")) {
				NodeList list = node.getChildren();//node的子节点
				//header div
				Node headerNode = list.extractAllNodesThatMatch(new NodeClassFilter(Div.class)).elementAt(0);
				Node h1Node = headerNode.getChildren().extractAllNodesThatMatch(headingFilter).elementAt(0);
				NodeList headerChrildrens = headerNode.getChildren();
				Node frNode = headerChrildrens.extractAllNodesThatMatch(divFilter).elementAt(0);
				ImageTag imgNode = (ImageTag) frNode.getFirstChild().getFirstChild();
				//cell div
				Node cellNode = list.extractAllNodesThatMatch(divFilter).elementAt(1);
				Node topic_content = cellNode.getChildren().extractAllNodesThatMatch(divFilter).elementAt(0);
				Node markdown_body = topic_content.getChildren().extractAllNodesThatMatch(divFilter).elementAt(0);

			} else if(node.getText().contains("id=\"r_")){
				Reply reply = new Reply();
				Node tableNode = node.getChildren().extractAllNodesThatMatch(tagFilter).elementAt(0);
				Node trNode = tableNode.getChildren().extractAllNodesThatMatch(tagFilter).elementAt(0);
				NodeList tagList = trNode.getChildren().extractAllNodesThatMatch(tagFilter);
				ImageTag reply_img = (ImageTag) tagList.elementAt(0).getChildren().extractAllNodesThatMatch(tagFilter).elementAt(0);
				//nodeList bodyNode = tagList;
		Forum forum = new Forum(topic, replylist);
		return null;


	public  void test() throws Exception {
		Html2Domain parse = new Html2DomainImpl();



附上项目代码:测试使用的是jdk1.6+eclipse kepler


  java解析html之HTMLparser初次尝试

    为了爬取一个网页的数据，尝试了一下HtmlParser来做小爬虫。下面是一个小案例，用来爬取论坛的帖子内容。1. HtmlParser 简介：htmlparser是一个纯Java写的HTML解析库，主要……