Categories
Python

Python 3.7 + BeautifulSoup 4 + Scrapy :简单爬虫实例

放假无聊,无聊写个爬虫,把本博客的文章列表页面(https://xingxiang.me/blog)扒下来

1. Python 3.7 + BeautifulSoup 4

输出 .csv 文件,如图:

代码如下:

import requests
import csv
import random
import time
import socket
import http.client
from bs4 import BeautifulSoup

def get_html(url, data=None, max_retries=None):
    """Download ``url`` with browser-like headers and return its body text.

    A failed request is reported with ``print`` and retried after a random
    pause whose range depends on the kind of failure.

    Args:
        url: Address of the page to download.
        data: Unused; kept so existing callers keep working.
        max_retries: How many times to retry after a failed attempt.
            ``None`` (the default) retries without limit, as before.

    Returns:
        The response body, decoded as UTF-8.

    Raises:
        OSError, http.client.BadStatusLine, http.client.IncompleteRead:
            The last failure, once ``max_retries`` retries are used up.
    """
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    # One request timeout (in seconds) is drawn per call and reused on retries.
    timeout = random.choice(range(80, 100))
    failures = 0
    while True:
        try:
            response = requests.get(url, headers=header, timeout=timeout)
            response.encoding = 'utf-8'
            break
        except socket.timeout as e:
            error, pause_range = e, range(20, 60)
        except socket.error as e:
            # On Python 3 socket.error is an alias of OSError, so this clause
            # also receives the exceptions raised by requests itself.
            error, pause_range = e, range(0, 60)
        except http.client.BadStatusLine as e:
            error, pause_range = e, range(30, 60)
        except http.client.IncompleteRead as e:
            error, pause_range = e, range(20, 60)
        print(error)
        failures += 1
        # Give up instead of looping forever when the caller set a bound.
        if max_retries is not None and failures > max_retries:
            raise error
        time.sleep(random.choice(pause_range))
    return response.text
    
def get_data(html_text):
    """Extract ``[href, title]`` rows from the blog list page markup.

    Every ``<div class="brick">`` contributes one row taken from the first
    ``<a>`` element inside it. The anchor's first ``<span>``, if any, is
    removed before the anchor text is read.

    Args:
        html_text: HTML source of the list page.

    Returns:
        A list of two-item lists ``[href, text]``, in document order.
        Bricks without an ``<a>`` are skipped; a missing ``href`` yields an
        empty string.
    """
    result = []
    bs = BeautifulSoup(html_text, "html.parser")
    for blog in bs.find_all('div', {'class': 'brick'}):
        a_tag = blog.find('a')
        if a_tag is None:
            # Nothing to record for a brick that carries no link.
            continue
        if a_tag.span is not None:
            # Drop the nested span so its text does not end up in the title.
            a_tag.span.extract()
        result.append([a_tag.get('href', ''), a_tag.text])

    return result
    
def data_output(data, filename, encoding=None):
    """Append ``data`` to ``filename`` as CSV rows.

    Args:
        data: Iterable of rows, each row an iterable of cell values.
        filename: Path of the CSV file; it is created if missing and
            appended to otherwise.
        encoding: Text encoding of the file. ``None`` (the default) keeps
            the platform default, as before. Pass e.g. ``'utf-8'`` so that
            characters outside the platform codec are written instead of
            being dropped by ``errors='ignore'``.
    """
    # newline='' lets the csv module control line endings itself.
    with open(filename, 'a', encoding=encoding, errors='ignore', newline='') as f:
        csv.writer(f).writerows(data)
        
if __name__ == '__main__':
    # Download the blog index page, pull out the (link, title) rows and
    # append them to blog.csv.
    blog_index_url = 'https://xingxiang.me/blog'
    rows = get_data(get_html(blog_index_url))
    data_output(rows, 'blog.csv')

2. Python 3.7 + Scrapy

输出 .json 文件,如图:

代码如下:

#myblog_spider.py
import scrapy
from myblog_spider.items import MyblogSpiderItem

class myBlogSpider(scrapy.Spider):
	"""Spider that yields one item per article link on the blog list page."""

	name = 'myBlogSpider'
	# Scrapy reads ``allowed_domains``; the previous ``allow_domains``
	# spelling was ignored, so no domain restriction was in effect.
	allowed_domains = ['xingxiang.me']
	start_urls = ["https://xingxiang.me/blog"]

	def parse(self, response):
		"""Yield a ``MyblogSpiderItem`` for each anchor directly under a brick div.

		``url`` is the anchor's ``href`` attribute and ``title`` is its first
		direct text node; either is ``None`` when absent.
		"""
		for blog in response.xpath('//div[@class="brick"]/a'):
			item = MyblogSpiderItem()
			item['url'] = blog.xpath('@href').extract_first()
			item['title'] = blog.xpath('text()').extract_first()
			yield item
#items.py
import scrapy

class MyblogSpiderItem(scrapy.Item):
	"""Scraped blog entry holding a ``title`` and a ``url`` field."""

	title = scrapy.Field()
	url = scrapy.Field()
#pipelines.py
import codecs
import json

class MyblogSpiderPipeline(object):
	"""Item pipeline that writes every item to ``logs.json``, one JSON object per line."""

	def __init__(self):
		# Built-in open handles the encoding directly; newline='' keeps the
		# "\n" terminators untranslated, as the codecs-based stream did.
		self.file = open('logs.json', 'w', encoding='utf-8', newline='')

	def process_item(self, item, spider):
		"""Serialize ``item`` as one JSON line and return it unchanged."""
		# ensure_ascii=False keeps non-ASCII text readable in the output.
		line = json.dumps(dict(item), ensure_ascii=False) + "\n"
		self.file.write(line)
		return item

	def close_spider(self, spider):
		"""Close the output file; this is the hook Scrapy calls on pipelines."""
		self.file.close()

	# Former method name, kept so existing callers or signal hookups still work.
	spider_closed = close_spider

Leave a Reply

Your email address will not be published. Required fields are marked *

*

code