Python Scrapy 学习示例代码
import scrapy
class NewsSpider(scrapy.Spider):
    """Crawl a paginated news listing and emit one item per article page.

    ``parse`` handles listing pages reached from ``start_urls`` and schedules
    article pages, which are handled by ``parse_news``.
    """

    name = 'news_spider'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/news']

    def parse(self, response):
        """Schedule each article linked from a listing page, then the next page.

        Yields one request per ``a.news-link`` href (callback ``parse_news``)
        and, when an ``a.next-page`` href is found, one more request for it
        with this method as the callback.
        """
        # Links to the news detail pages found on this listing page.
        article_hrefs = response.css('a.news-link::attr(href)').getall()
        for href in article_hrefs:
            yield response.follow(href, self.parse_news)

        # Pagination: stop here when there is no next-page link.
        next_href = response.css('a.next-page::attr(href)').get()
        if not next_href:
            return
        yield response.follow(next_href, self.parse)

    def parse_news(self, response):
        """Extract the fields of a single news detail page.

        Yields a dict with two keys: ``title`` (result of ``.get()`` on the
        ``h1.news-title`` text selector) and ``content`` (result of
        ``.getall()`` on the ``div.news-content`` text selector).
        """
        headline = response.css('h1.news-title::text').get()
        # NOTE(review): '::text' matches only text nodes that are direct
        # children of the div; text inside nested tags (e.g. <p>) would not
        # be collected. Confirm against the real page markup.
        body_parts = response.css('div.news-content::text').getall()

        # Further fields (publication date, author, ...) can be extracted here.
        item = {
            'title': headline,
            'content': body_parts,
            # other fields
        }
        yield item
爬虫技术仅供学习,请勿用于违法犯罪活动。