SSISO Community

시소당

HTML문서에서 의미있는 부분 뽑아서 파일로 저장하기



SAX와  같이  javax에서  제공해주는  기본  파서를  이용하여

html을  다루어  본다.  특히  callback클래스를  상속받아  자신만의  콜백  함수들을

만들어서  스트림을  읽어오면서  그에  따라  발생하는  이벤트를  핸들링하는  방식을  택했다.

  

/*

  *  Created  on  2004.  8.  19.  made  by  chchoi

  *  Intelligent  Systems  Lab.  Hanyang  Univ.

  *

  *  This class crawls the web page at a URL supplied by the user.
  *
  *  Given the target URL, the parse(String targeturl) method reads the
  *  page's source and collects its text content.
  *
  */


import  javax.swing.text.html.*;  //  for  using  HTMLEditorKit
import  java.io.*;
import  java.net.*;
import  javax.swing.text.*;

  

public  class  CrawlingTheWeb
{
        ParserGetter  kit  =  new  ParserGetter();
        HTMLEditorKit.Parser  parser  =  kit.getParser();
        FileOutputStream  fs;
            
        public  void  parse(String  targeturl)
        {

                try{
                  URL  u          =  new  URL(targeturl);
                  InputStream  in    =  u.openStream();
                  InputStreamReader  r  =  new  InputStreamReader(in,"ksc5601");
                  fs  =  new  FileOutputStream("tempHTMLSrc.txt");
                  HTMLEditorKit.ParserCallback  callback  =
                          new  TagStripper(new  OutputStreamWriter(fs));
                  //  parsing
                  parser.parse(r,  callback,  true);
                  fs.close();
                }catch  (IOException  e)
                {
                        System.err.println(e);
                }
          }
        public  static  void  main(String[]  args)
        {
                new  CrawlingTheWeb().parse(args[0]);
        }
}

  

/*  by  callback  functions,  this  program  interprets  html  */

/**
 * Parser callback that copies the text content of an HTML document to a
 * Writer while dropping markup. Text found inside HEAD, STYLE and SCRIPT
 * elements is suppressed; simple (empty) tags and table cells are
 * replaced by whitespace separators.
 */
class TagStripper extends HTMLEditorKit.ParserCallback{

        private Writer out;
        /*
         * Number of currently open HEAD/STYLE/SCRIPT elements. A counter is
         * used instead of a boolean because these elements nest (SCRIPT and
         * STYLE normally sit inside HEAD): with a boolean, the inner end tag
         * would re-enable output while still inside HEAD.
         */
        private int skipDepth;

        /**
         * @param out destination for the extracted text
         */
        public TagStripper(Writer out){
                this.out = out;
        }

        /* comments carry no visible text, so they are dropped */
        public void handleComment(char[] data, int pos){
        }

        /* all of the text except the tags is processed */
        public void handleText(char[] text, int position)
        {
                if(skipDepth == 0){
                        try{
                                out.write(text);
                                out.flush();
                        }
                        catch(IOException e){
                                System.err.println(e);
                        }
                }
        }

        /* this function is called on meeting the end position of a tag */
        public void handleEndTag(HTML.Tag tag, int position)
        {
                if(isSkipped(tag)){
                        // guard against a stray end tag without a matching start tag
                        if(skipDepth > 0){
                                skipDepth--;
                        }
                }
                /* separate adjacent table cells with a blank */
                if(tag == HTML.Tag.TD)
                {
                        try{
                                out.write(' ');
                                out.flush();
                        }
                        catch(IOException e){
                                System.err.println(e);
                        }
                }
                if(tag == HTML.Tag.HTML){
                        try{
                                this.flush();
                        }catch(BadLocationException be){
                                System.err.println(be);
                        }
                }
        }

        /* this function is called on meeting the start position of a tag */
        public void handleStartTag(HTML.Tag tag,
                        MutableAttributeSet attribute, int position)
        {
                if(isSkipped(tag)){
                        skipDepth++;
                }
        }

        /*
         * callback invoked by the parser for simple (empty) tags;
         * block tags become a blank line, flow-breaking tags a line break,
         * and every other tag a single blank
         */
        public void handleSimpleTag(HTML.Tag tag,
                        MutableAttributeSet attribute, int position)
        {
                try{
                        if(tag.isBlock()){
                                out.write("\r\n");
                                out.write("\r\n");
                        }
                        else if(tag.breaksFlow()){
                                out.write("\r\n");
                        }
                        else{
                                out.write(' ');
                        }
                        // flush like every other write in this class so that trailing
                        // separators are not left behind in the writer's buffer
                        out.flush();
                }catch(IOException e){
                        System.err.println(e);
                }
        }

        /* true for the elements whose text content must not be emitted */
        private static boolean isSkipped(HTML.Tag tag){
                return tag == HTML.Tag.HEAD
                        || tag == HTML.Tag.STYLE
                        || tag == HTML.Tag.SCRIPT;
        }
}

  

  

/*  Only  need  to  change  access  levels  from  protected  to  public  */

/*
 * Exposes the editor kit's parser: the inherited getParser() is
 * re-declared here with public access so other classes can call it.
 */
class ParserGetter extends HTMLEditorKit{
        public HTMLEditorKit.Parser getParser(){
                HTMLEditorKit.Parser inherited = super.getParser();
                return inherited;
        }
}

[출처]  HTML문서에서  의미있는  부분  뽑아서  파일로  저장하기|작성자  처리
http://blog.naver.com/ace772?Redirect=Log&logNo=120005152613

521 view

4.0 stars