본문 바로가기
Python/0x01-url

html parser class ][ 보안뉴스 가장 많이 본 뉴스 keyword 가져오기

by SpeeDr00t 2017. 10. 3.
반응형

html parser class ][ 보안뉴스 가장 많이 본 뉴스 keyword 가져오기


1) urllib 버전 (표준 라이브러리 urllib.request 사용)

# -*- coding: utf-8 -*-
# 
# written by kyoung chip , jang
#
# python 3.6
# pip list
# beautifulsoup4  (urllib.request ships with the standard library)
#
# urllib3 is not required; this script only uses urllib.request
# pip install bs4
#
import urllib
import urllib.request
from bs4 import BeautifulSoup

class CHtmlParser:
    """Download a web page with urllib and parse it with BeautifulSoup."""

    # Seconds to wait for the server before giving up; without a timeout a
    # stalled connection would block the caller forever.
    DEFAULT_TIMEOUT = 10

    def __init__(self):
        # URL remembered by setUrl(); getHtml() falls back to it when it is
        # called without an explicit URL.
        self.url = ''
        # Parsed document produced by the most recent getHtml() call
        # (an empty string until the first successful fetch).
        self.response = ''

    def setUrl(self, url):
        """Remember ``url`` as the default target for getHtml()."""
        self.url = url

    def getHtml(self, url=None, timeout=DEFAULT_TIMEOUT):
        """Fetch a page and return it as a BeautifulSoup document.

        Args:
            url: Address to download. When omitted, the URL stored by
                setUrl() is used instead.
            timeout: Seconds to wait on the network before urlopen raises.

        Returns:
            The parsed BeautifulSoup object, also stored in ``self.response``.

        Raises:
            ValueError: If no URL was passed and none was set via setUrl().
        """
        target = self.url if url is None else url
        if not target:
            raise ValueError("no URL given and none set via setUrl()")

        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(target, timeout=timeout) as page:
            body = page.read()

        self.response = BeautifulSoup(body, 'html.parser', from_encoding='utf-8')
        return self.response
        		

		
class CBoanNews:
    """Print the keywords from the Boannews most-viewed news listing."""

    # Listing page that doWork() scrapes unless another URL is supplied.
    DEFAULT_URL = "http://www.boannews.com/media/o_list.asp?kind=B"

    def __init__(self):
        # Helper that downloads and parses the page.
        self.html = CHtmlParser()

    def doWork(self, url=DEFAULT_URL):
        """Fetch the listing page and print each news keyword on its own line.

        Args:
            url: Page to scrape; defaults to the most-viewed listing.

        Every ``<span class="news_txt">`` found inside a
        ``<div class="news_list">`` is printed. The text is taken with
        get_text() rather than by slicing the string form of the result
        list, so nested tags, HTML entities and several spans in one div
        are all handled, and a div without a matching span prints nothing.
        """
        soup = self.html.getHtml(url)

        for item in soup.find_all("div", class_="news_list"):
            for span in item.find_all("span", class_="news_txt"):
                print(span.get_text())
        

			
if __name__ == "__main__":
    # Script entry point: build the scraper and run it once.
    crawler = CBoanNews()
    crawler.doWork()


	
			
			



2) requests 버전

# -*- coding: utf-8 -*-
# 
# written by kyoung chip , jang
#
# python 3.6
#
# pip list
# requests , beautifulsoup4
#
# pip install requests
# pip install bs4
#
import requests
from bs4 import BeautifulSoup


		
class CHtmlParser:
    """Download a web page with requests and parse it with BeautifulSoup."""

    # Seconds to wait for the server; requests applies no timeout unless one
    # is passed, so a stalled server would otherwise block the caller forever.
    DEFAULT_TIMEOUT = 10

    def __init__(self):
        # Most recent requests response (an empty string until the first call).
        self.req = ''

    def login(self, url, user, passwd, timeout=DEFAULT_TIMEOUT):
        """GET ``url`` with credentials and print the response body.

        Args:
            url: Address to request.
            user: User name passed to requests as the first ``auth`` element.
            passwd: Password passed to requests as the second ``auth`` element.
            timeout: Seconds to wait before requests raises a timeout error.
        """
        self.req = requests.get(url, auth=(user, passwd), timeout=timeout)
        print(self.req.text)

    def getHtml(self, url, timeout=DEFAULT_TIMEOUT):
        """Fetch ``url`` and return it as a BeautifulSoup document.

        Args:
            url: Address to download.
            timeout: Seconds to wait before requests raises a timeout error.

        Returns:
            A BeautifulSoup object built from the decoded response text.
            The raw response is kept in ``self.req`` for inspection
            (status_code, headers, encoding, ...).
        """
        self.req = requests.get(url, timeout=timeout)
        return BeautifulSoup(self.req.text, 'html.parser')
		
		
        		


class CBoanNews:
    """Print the keywords from the Boannews most-viewed news listing."""

    # Listing page that doWork() scrapes unless another URL is supplied.
    DEFAULT_URL = "http://www.boannews.com/media/o_list.asp?kind=B"

    def __init__(self):
        # Helper that downloads and parses the page.
        self.html = CHtmlParser()

    def doWork(self, url=DEFAULT_URL):
        """Fetch the listing page and print each news keyword on its own line.

        Args:
            url: Page to scrape; defaults to the most-viewed listing.

        Every ``<span class="news_txt">`` found inside a
        ``<div class="news_list">`` is printed. The text is taken with
        get_text() rather than by slicing the string form of the result
        list, so nested tags, HTML entities and several spans in one div
        are all handled, and a div without a matching span prints nothing.
        """
        soup = self.html.getHtml(url)

        for item in soup.find_all("div", class_="news_list"):
            for span in item.find_all("span", class_="news_txt"):
                print(span.get_text())
			
        
if __name__ == "__main__":
    # Script entry point: build the scraper and run it once.
    crawler = CBoanNews()
    crawler.doWork()


반응형

'Python > 0x01-url' 카테고리의 다른 글

wget  (0) 2017.10.06
html parser class ][ security focus title과 link정보 가져오기  (0) 2017.10.05
request class  (0) 2017.10.03
url parser ][ CUrlParser  (0) 2017.10.03