반응형
html parser class ][ 보안뉴스 가장 많이 본 뉴스 keyword 가져오기
1) urllib 버전 (표준 라이브러리 urllib.request 사용)
# -*- coding: utf-8 -*-
#
# Written by kyoung chip, jang
#
# python 3.6
# pip list
#   beautifulsoup4 , urllib3
#
# pip install urllib3
# pip install bs4
#
# NOTE: the code below fetches pages with the standard-library
# urllib.request module; urllib3 itself is never imported.
#
import urllib
import urllib.request

from bs4 import BeautifulSoup

# Seconds to wait for the server before giving up, so a stalled
# connection cannot hang the script forever.
DEFAULT_TIMEOUT = 10


class CHtmlParser:
    """Download a page and parse it with BeautifulSoup."""

    def __init__(self):
        self.url = ''
        self.response = ''

    def setUrl(self, url):
        """Remember ``url`` on the instance.

        ``getHtml`` takes its URL as an argument and does not read this value.
        """
        self.url = url

    def getHtml(self, url, timeout=DEFAULT_TIMEOUT):
        """Fetch ``url`` and return the body as a BeautifulSoup tree.

        The tree is also stored in ``self.response``.

        Args:
            url: address of the page to download.
            timeout: seconds passed to ``urlopen`` as its timeout.

        Returns:
            The parsed ``BeautifulSoup`` document.

        Raises:
            urllib.error.URLError: propagated from ``urlopen`` on network or
                HTTP errors.
        """
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(url, timeout=timeout) as conn:
            raw = conn.read()
        self.response = BeautifulSoup(raw, 'html.parser', from_encoding='utf-8')
        return self.response


class CBoanNews:
    """Print the text of the ``news_txt`` spans on the boannews.com list page."""

    def __init__(self):
        self.html = CHtmlParser()

    def doWork(self):
        """Fetch the list page and print one headline per ``news_list`` div."""
        response = self.html.getHtml("http://www.boannews.com/media/o_list.asp?kind=B")
        for item in response.find_all("div", "news_list"):
            title = item.find('span', 'news_txt')
            if title is None:
                # No headline span in this div: nothing to print.
                continue
            # get_text() yields the decoded text, without nested markup or
            # escaped entities that slicing the tag's string form would keep.
            print(title.get_text())


if __name__ == '__main__':
    r = CBoanNews()
    r.doWork()
2) requests 버전
# -*- coding: utf-8 -*-
#
# Written by kyoung chip, jang
#
# python 3.6
#
# pip list
#   requests , beautifulsoup4
#
# pip install requests
# pip install bs4
#
import requests
from bs4 import BeautifulSoup

# Seconds to wait for the server before giving up; requests waits
# indefinitely when no timeout is given.
DEFAULT_TIMEOUT = 10


class CHtmlParser:
    """Download a page with requests and parse it with BeautifulSoup."""

    def __init__(self):
        self.req = ''

    def login(self, url, user, passwd, timeout=DEFAULT_TIMEOUT):
        """GET ``url`` with HTTP auth credentials and print the response text.

        Args:
            url: address to request.
            user: user name passed in the ``auth`` tuple.
            passwd: password passed in the ``auth`` tuple.
            timeout: seconds passed to ``requests.get`` as its timeout.
        """
        self.req = requests.get(url, auth=(user, passwd), timeout=timeout)
        print(self.req.text)

    def getHtml(self, url, timeout=DEFAULT_TIMEOUT):
        """Fetch ``url`` and return the body as a BeautifulSoup tree.

        The raw response is kept in ``self.req``.

        Args:
            url: address of the page to download.
            timeout: seconds passed to ``requests.get`` as its timeout.

        Returns:
            The parsed ``BeautifulSoup`` document.

        Raises:
            requests.RequestException: on network failure, timeout, or an
                HTTP error status.
        """
        self.req = requests.get(url, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of silently parsing an error page.
        self.req.raise_for_status()

        # Without a charset in the Content-Type header, requests decodes
        # text/* bodies as ISO-8859-1, which garbles non-Latin pages; fall
        # back to the encoding detected from the body in that case.
        content_type = self.req.headers.get('content-type', '')
        if 'charset' not in content_type.lower():
            self.req.encoding = self.req.apparent_encoding

        # Handy while debugging: self.req.status_code,
        # self.req.headers['content-type'], self.req.encoding,
        # self.req.text, self.req.json()
        return BeautifulSoup(self.req.text, 'html.parser')


class CBoanNews:
    """Print the text of the ``news_txt`` spans on the boannews.com list page."""

    def __init__(self):
        self.html = CHtmlParser()

    def doWork(self):
        """Fetch the list page and print one headline per ``news_list`` div."""
        response = self.html.getHtml("http://www.boannews.com/media/o_list.asp?kind=B")
        for item in response.find_all("div", "news_list"):
            title = item.find('span', 'news_txt')
            if title is None:
                # No headline span in this div: nothing to print.
                continue
            # get_text() yields the decoded text, without nested markup or
            # escaped entities that slicing the tag's string form would keep.
            print(title.get_text())


if __name__ == '__main__':
    r = CBoanNews()
    r.doWork()
반응형
'Python > 0x01-url' 카테고리의 다른 글
wget (0) | 2017.10.06 |
---|---|
html parser class ][ security focus title과 link정보 가져오기 (0) | 2017.10.05 |
request class (0) | 2017.10.03 |
url parser ][ CUrlParser (0) | 2017.10.03 |