시소당
SAX와 같이 javax에서 제공해주는 기본 파서를 이용하여
html을 다루어 본다. 특히 callback클래스를 상속받아 자신만의 콜백 함수들을
만들어서 스트림을 읽어오면서 그에 따라 발생하는 이벤트를 핸들링하는 방식을 택했다.
/*
* Created on 2004. 8. 19. made by chchoi
* Intelligent Systems Lab. Hanyang Univ.
*
* This class fetches the web page at a URL supplied by the user.
*
*
* Given the target URL, the parse(String targeturl) method collects the text
* of the page's source and writes it to a file.
*
*/
import javax.swing.text.html.*; // for using HTMLEditorKit
import java.io.*;
import java.net.*;
import javax.swing.text.*;
/**
 * Downloads a web page and saves its text, with the markup removed, to the
 * file tempHTMLSrc.txt in the current working directory.
 *
 * The page is read through the Swing HTML parser; a TagStripper callback
 * receives the parser events and writes the text it keeps to the file.
 */
public class CrawlingTheWeb
{
    /** Editor kit subclass used only to obtain the HTML parser. */
    ParserGetter kit = new ParserGetter();
    /** Parser that drives the callback while the page is read. */
    HTMLEditorKit.Parser parser = kit.getParser();
    /** Output stream opened by the most recent parse() call; it is closed before parse() returns. */
    FileOutputStream fs;

    /**
     * Reads the page at the given URL and writes its stripped text to
     * tempHTMLSrc.txt. I/O failures are reported on System.err and are not
     * rethrown.
     *
     * @param targeturl address of the page to fetch
     */
    public void parse(String targeturl)
    {
        InputStream in = null;
        FileOutputStream file = null;
        Writer out = null;
        try {
            URL u = new URL(targeturl);
            in = u.openStream();
            // The page bytes are always decoded with the "ksc5601" charset.
            InputStreamReader r = new InputStreamReader(in, "ksc5601");
            file = new FileOutputStream("tempHTMLSrc.txt");
            fs = file;
            out = new OutputStreamWriter(file);
            HTMLEditorKit.ParserCallback callback = new TagStripper(out);
            // parsing; the last argument asks the parser to ignore charset
            // directives found in the page
            parser.parse(r, callback, true);
        } catch (IOException e) {
            System.err.println(e);
        } finally {
            // Close through the writer when it exists so that characters it
            // still buffers reach the file; closing the raw stream would drop them.
            if (out != null) {
                try {
                    out.close();
                } catch (IOException e) {
                    System.err.println(e);
                }
            } else if (file != null) {
                try {
                    file.close();
                } catch (IOException e) {
                    System.err.println(e);
                }
            }
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    System.err.println(e);
                }
            }
        }
    }

    /**
     * Command-line entry point.
     *
     * @param args args[0] is the URL of the page to fetch
     */
    public static void main(String[] args)
    {
        if (args == null || args.length < 1) {
            System.err.println("Usage: java CrawlingTheWeb <url>");
            return;
        }
        new CrawlingTheWeb().parse(args[0]);
    }
}
/**
 * Parser callback that removes the markup from an HTML stream and writes the
 * remaining text to a Writer.
 *
 * Text found inside HEAD, STYLE or SCRIPT elements is discarded. Simple tags
 * are replaced by line breaks or a space depending on the kind of tag, and a
 * space is written after every closing TD tag. I/O errors are reported on
 * System.err and do not interrupt parsing.
 */
class TagStripper extends HTMLEditorKit.ParserCallback {
    private Writer out;
    /** True between the start and end tags of HEAD. */
    private boolean inHead;
    /** True between the start and end tags of STYLE or SCRIPT. */
    private boolean inStyleOrScript;

    /**
     * @param out destination for the extracted text
     */
    public TagStripper(Writer out) {
        this.out = out;
    }

    /** Comments are dropped from the output. */
    public void handleComment(char[] data, int pos) {
    }

    /**
     * Writes a run of text unless it lies inside HEAD, STYLE or SCRIPT.
     *
     * @param text characters of the text run
     * @param position position reported by the parser (unused)
     */
    public void handleText(char[] text, int position)
    {
        if (inHead || inStyleOrScript) {
            return;
        }
        try {
            out.write(text);
            out.flush();
        } catch (IOException e) {
            System.err.println(e);
        }
    }

    /**
     * Called when the parser meets the end of a tag.
     *
     * @param tag the tag being closed
     * @param position position reported by the parser (unused)
     */
    public void handleEndTag(HTML.Tag tag, int position)
    {
        if (tag == HTML.Tag.HEAD) {
            // Leaving HEAD ends every suppressed region that was opened in it.
            inHead = false;
            inStyleOrScript = false;
        } else if (tag == HTML.Tag.STYLE || tag == HTML.Tag.SCRIPT) {
            // Only the STYLE/SCRIPT flag is cleared: when the element sits
            // inside HEAD, the rest of HEAD (e.g. the title) stays suppressed.
            inStyleOrScript = false;
        }
        // Separate the contents of adjacent table cells.
        if (tag == HTML.Tag.TD) {
            try {
                out.write(' ');
                out.flush();
            } catch (IOException e) {
                System.err.println(e);
            }
        }
        if (tag == HTML.Tag.HTML) {
            try {
                this.flush();
            } catch (BadLocationException be) {
                System.err.println(be);
            }
        }
    }

    /**
     * Called when the parser meets the start of a tag.
     *
     * @param tag the tag being opened
     * @param attribute attributes of the tag (unused)
     * @param position position reported by the parser (unused)
     */
    public void handleStartTag(HTML.Tag tag,
            MutableAttributeSet attribute, int position)
    {
        if (tag == HTML.Tag.HEAD) {
            inHead = true;
        } else if (tag == HTML.Tag.STYLE || tag == HTML.Tag.SCRIPT) {
            inStyleOrScript = true;
        }
    }

    /**
     * Called by the parser for simple tags. Writes two line breaks for a
     * block tag, one line break for a tag that breaks the flow, and a single
     * space for any other tag.
     *
     * @param tag the simple tag
     * @param attribute attributes of the tag (unused)
     * @param position position reported by the parser (unused)
     */
    public void handleSimpleTag(HTML.Tag tag,
            MutableAttributeSet attribute, int position)
    {
        try {
            if (tag.isBlock()) {
                out.write("\r\n");
                out.write("\r\n");
            } else if (tag.breaksFlow()) {
                out.write("\r\n");
            } else {
                out.write(' ');
            }
            // Flush like the other handlers do, so separators are not left
            // behind in a buffering writer.
            out.flush();
        } catch (IOException e) {
            System.err.println(e);
        }
    }

    /**
     * Flushes the underlying writer. Overrides the inherited flush() so that
     * the call made at the closing HTML tag pushes pending output to the
     * writer's destination.
     *
     * @throws BadLocationException never thrown here; declared to match the
     *         overridden method
     */
    public void flush() throws BadLocationException
    {
        try {
            out.flush();
        } catch (IOException e) {
            System.err.println(e);
        }
    }
}
/**
 * Editor kit whose only purpose is to make the inherited parser accessor
 * callable from outside: getParser() is re-declared here as public.
 */
class ParserGetter extends HTMLEditorKit {
    /**
     * @return the parser supplied by the superclass
     */
    public HTMLEditorKit.Parser getParser() {
        final HTMLEditorKit.Parser inherited = super.getParser();
        return inherited;
    }
}
[출처] HTML문서에서 의미있는 부분 뽑아서 파일로 저장하기|작성자 처리
http://blog.naver.com/ace772?Redirect=Log&logNo=120005152613