30天学会Python编程:20. Python网络爬虫简介

wptr33 2025-07-08 23:41 64 浏览

20.1 网络爬虫基础

20.1.1 爬虫定义与原理

20.1.2 法律与道德规范

表20-1 爬虫合法性要点

注意事项	说明	合规建议
robots协议	网站访问规则	遵守robots.txt
访问频率	请求间隔控制	添加适当延迟
数据使用	版权与隐私	仅用于合法用途
用户认证	登录权限	不破解验证机制

20.2 请求库使用

20.2.1 requests库

基本使用：

import requests

def fetch_page(url):
    """Download ``url`` with a browser-like header set and return its body text.

    The request uses a 5-second timeout. Any ``requests`` failure, including
    a non-2xx status raised by ``raise_for_status()``, is printed and turned
    into a ``None`` return value instead of propagating.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response body, or ``None`` when the request failed.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0',
        'Accept-Language': 'zh-CN'
    }
    try:
        resp = requests.get(url, headers=request_headers, timeout=5)
        # Turn 4xx/5xx answers into an exception so they follow the failure path.
        resp.raise_for_status()
        return resp.text
    except requests.exceptions.RequestException as e:
        print(f"请求失败: {e}")
        return None

# Example usage: html holds the page text, or None if the request failed
html = fetch_page('https://example.com')

20.2.2 高级请求技巧

# Session persistence: one Session object is reused for follow-up requests.
# Every call below passes a timeout; without one, requests can wait forever.
session = requests.Session()
session.get('https://example.com/login', params={'user': 'test'}, timeout=5)

# Proxy configuration
url = 'https://example.com'  # target of the proxied request (was undefined)
proxies = {
    'http': 'http://10.10.1.10:3128',
    'https': 'http://10.10.1.10:1080'
}
response = requests.get(url, proxies=proxies, timeout=5)

# File download: stream the body in 8 KiB chunks instead of loading it at once
with requests.get('https://example.com/image.jpg', stream=True, timeout=5) as r:
    # Fail before opening the file so an error page is never saved as the image.
    r.raise_for_status()
    with open('image.jpg', 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)

20.3 数据解析技术

20.3.1 BeautifulSoup解析

from bs4 import BeautifulSoup

def parse_html(html):
    """Extract article titles, external links and main content from ``html``.

    Args:
        html: Markup of the page to parse.

    Returns:
        A dict with three keys:
        ``'titles'``  - text of every ``<h1 class="article-title">``;
        ``'links'``   - ``href`` of every ``<a class="external">`` that has one;
        ``'content'`` - newline-separated text of ``<div id="content">``, or
        ``''`` when the page has no such element.
    """
    soup = BeautifulSoup(html, 'lxml')

    # CSS selector
    titles = soup.select('h1.article-title')

    # Attribute extraction. href=True skips anchors without an href attribute,
    # which would otherwise raise KeyError on a['href'].
    links = [
        a['href']
        for a in soup.find_all('a', class_='external', href=True)
    ]

    # Text handling. find() returns None when the element is absent, so guard
    # before calling get_text().
    content_div = soup.find('div', id='content')
    if content_div is None:
        content = ''
    else:
        content = content_div.get_text(strip=True, separator='\n')

    return {
        'titles': [t.text for t in titles],
        'links': links,
        'content': content
    }

20.3.2 XPath与lxml

from lxml import etree

def xpath_parse(html):
    """Extract prices and product records from ``html`` using XPath.

    Args:
        html: Markup of the page to parse.

    Returns:
        A dict with two keys:
        ``'prices'`` - text nodes of every ``<div class="price">``;
        ``'items'``  - one ``{'name', 'sku'}`` dict per ``<div class="product">``.
        ``name`` is the first ``<h2>`` text inside the product and ``sku`` its
        ``data-sku`` attribute; either is ``None`` when missing.
    """
    tree = etree.HTML(html)

    # Product prices
    prices = tree.xpath('//div[@class="price"]/text()')

    # Nested data: xpath() returns a list, which may be empty. Indexing [0]
    # blindly would abort the whole parse on one incomplete product.
    items = []
    for product in tree.xpath('//div[@class="product"]'):
        names = product.xpath('.//h2/text()')
        skus = product.xpath('./@data-sku')
        items.append({
            'name': names[0] if names else None,
            'sku': skus[0] if skus else None
        })

    return {'prices': prices, 'items': items}

20.4 动态页面处理

20.4.1 Selenium自动化

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def selenium_crawl(url, *, wait_selector=".dynamic-content", timeout=10):
    """Render ``url`` in headless Chrome and return the resulting page source.

    Args:
        url: Page to open.
        wait_selector: CSS selector of an element that must be present in the
            DOM before the page is considered loaded.
        timeout: Maximum number of seconds to wait for ``wait_selector``.

    Returns:
        The HTML of the page after the wait and a scroll to the bottom.

    Raises:
        selenium.common.exceptions.TimeoutException: ``wait_selector`` did not
            appear within ``timeout`` seconds.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # headless mode
    driver = webdriver.Chrome(options=options)

    try:
        driver.get(url)

        # Block until the awaited element exists. The element itself is not
        # needed afterwards, so the return value is discarded.
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, wait_selector))
        )

        # Run JavaScript: scroll to the bottom of the document
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Page source after rendering
        return driver.page_source
    finally:
        # Always shut the browser down, even when the wait times out.
        driver.quit()

20.4.2 接口逆向分析

import json

def api_crawl():
    """Query the example JSON API once and print each item's name and price.

    Sends ``page``, ``size`` and a millisecond ``timestamp`` as query
    parameters, then prints one line per entry of the response's ``'list'``.

    Raises:
        requests.exceptions.RequestException: The request failed, timed out
            or returned a non-2xx status.
        KeyError: The JSON payload lacks ``'list'``, or an item lacks
            ``'name'`` / ``'price'``.
    """
    # Local import: this snippet calls time.time() without importing time.
    import time

    # Endpoint found by inspecting the page's XHR requests
    api_url = 'https://api.example.com/data'
    params = {
        'page': 1,
        'size': 20,
        'timestamp': int(time.time() * 1000)
    }

    # A timeout prevents hanging forever. raise_for_status() surfaces HTTP
    # errors before .json() tries to parse an error page.
    response = requests.get(api_url, params=params, timeout=5)
    response.raise_for_status()
    data = response.json()

    # Walk the JSON payload
    for item in data['list']:
        print(f"商品: {item['name']}, 价格: {item['price']}")

20.5 数据存储方案

20.5.1 文件存储

import csv
import json

def save_to_csv(data, filename):
    """Write a list of dicts to ``filename`` as UTF-8 CSV with a header row.

    The column order is taken from the keys of the first row.

    Args:
        data: Sequence of dicts that share the keys of ``data[0]``.
        filename: Destination path; an existing file is overwritten.

    An empty ``data`` leaves an empty file instead of raising IndexError,
    because there is no first row to derive a header from.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        if not data:
            return
        writer = csv.DictWriter(f, fieldnames=list(data[0].keys()))
        writer.writeheader()
        writer.writerows(data)

def save_to_json(data, filename):
    """Serialise ``data`` to ``filename`` as UTF-8 JSON.

    Non-ASCII characters are written as-is rather than escaped, and the
    output is indented by two spaces. An existing file is overwritten.

    Args:
        data: Any JSON-serialisable object.
        filename: Destination path.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        text = json.dumps(data, ensure_ascii=False, indent=2)
        out.write(text)

20.5.2 数据库存储

import sqlite3
import pymongo

# SQLite storage
def sqlite_save(data):
    """Append product rows to the ``products`` table of ``data.db``.

    The table ``(id TEXT, name TEXT, price REAL)`` is created on first use.

    Args:
        data: Iterable of dicts with ``'id'``, ``'name'`` and ``'price'`` keys.
    """
    conn = sqlite3.connect('data.db')
    try:
        # "with conn" commits on success and rolls back if an insert fails.
        with conn:
            conn.execute('''CREATE TABLE IF NOT EXISTS products
               (id TEXT, name TEXT, price REAL)''')
            conn.executemany('INSERT INTO products VALUES (?,?,?)',
                             [(d['id'], d['name'], d['price']) for d in data])
    finally:
        # The context manager above does not close the connection, so release
        # the file handle explicitly.
        conn.close()

# MongoDB storage
def mongo_save(data):
    """Insert ``data`` into the ``products`` collection of ``web_data``.

    Connects to the MongoDB server at ``mongodb://localhost:27017/``.

    Args:
        data: List of documents (dicts) to insert. An empty list is a no-op,
            because ``insert_many`` rejects empty input.
    """
    if not data:
        return

    client = pymongo.MongoClient('mongodb://localhost:27017/')
    try:
        db = client['web_data']
        collection = db['products']
        collection.insert_many(data)
    finally:
        # Release the client's connections; the original never closed them.
        client.close()

20.6 反爬应对策略

20.6.1 常见反爬机制

表20-2 常见反爬技术与应对

反爬技术	识别特征	破解方法
User-Agent检测	无浏览器特征	轮换User-Agent
IP限制	频繁访问被封	使用代理IP池
验证码	出现验证页面	打码平台/OCR识别
请求参数加密	参数含加密字段	逆向JS分析
动态渲染	数据通过JS加载	Selenium/Puppeteer

20.6.2 高级反反爬技巧

# Example proxy IP pool
class ProxyPool:
    """Round-robin pool of proxy URLs.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, proxies=None):
        """Create the pool.

        Args:
            proxies: Optional iterable of proxy URLs. When omitted, the
                placeholder list from the tutorial is used.
        """
        if proxies is None:
            proxies = [
                'http://ip1:port',
                'http://ip2:port',
                # ...
            ]
        # Copy so later changes to the caller's list do not affect the pool.
        self.proxies = list(proxies)
        # Number of proxies handed out so far; drives the rotation.
        self.current = 0

    def get_proxy(self):
        """Return the next proxy as a ``requests``-style ``proxies`` mapping.

        Returns:
            ``{'http': url, 'https': url}`` for the next URL in rotation.

        Raises:
            ValueError: The pool contains no proxies.
        """
        if not self.proxies:
            # Without this guard the modulo below fails with ZeroDivisionError.
            raise ValueError('proxy pool is empty')
        proxy = self.proxies[self.current % len(self.proxies)]
        self.current += 1
        return {'http': proxy, 'https': proxy}

# Random request-header generation
from fake_useragent import UserAgent
ua = UserAgent()  # shared instance; get_random_headers() reads ua.random from it

def get_random_headers():
    """Build a request-header dict with a freshly drawn User-Agent.

    Returns:
        A dict with ``'User-Agent'`` (taken from ``ua.random``), a fixed
        Google ``'Referer'`` and a fixed ``'Accept-Encoding'`` value.
    """
    headers = {}
    headers['User-Agent'] = ua.random
    headers['Referer'] = 'https://www.google.com/'
    # NOTE(review): advertising 'br' presumably requires Brotli support in the
    # HTTP client - confirm it is installed where this is used.
    headers['Accept-Encoding'] = 'gzip, deflate, br'
    return headers

20.7 应用举例

案例1：电商商品爬虫

import requests
from bs4 import BeautifulSoup
import time
import random

def ecommerce_crawler(base_url, max_page=10):
    products = []
    
    for page in range(1, max_page+1):
        # 带延迟的请求
        time.sleep(random.uniform(1, 3))
        
        url = f"{base_url}?page={page}"
        html = fetch_page(url)
        if not html:
            continue
        
        soup = BeautifulSoup(html, 'lxml')
        items = soup.select('.product-item')
        
        for item in items:
            try:
                products.append({
                    'name': item.select_one('.name').text.strip(),
                    'price': float(item.select_one('.price').text.replace('yen', '')),
                    'sku': item['data-sku'],
                    'rating': item.select_one('.rating').text.strip()
                })
            except Exception as e:
                print(f"解析失败: {e}")
    
    save_to_csv(products, 'products.csv')
    return products

# Example usage: crawl with the default max_page=10; the returned list is discarded here
ecommerce_crawler('https://example.com/products')

案例2：新闻聚合爬虫

import schedule
import datetime

def news_monitor():
    """Collect news from every configured source, store it and print a summary.

    Each source URL is handled independently: a failure is printed and the
    remaining sources are still processed. URLs containing ``'rss'`` go to
    ``parse_rss``; all others go to ``parse_news_api``.
    """
    all_news = []

    for url in (
        'https://news.source1.com/rss',
        'https://news.source2.com/api/latest',
    ):
        try:
            # Pick the parser by URL shape: RSS feed vs. JSON API.
            parse = parse_rss if 'rss' in url else parse_news_api
            all_news += parse(url)
        except Exception as e:
            print(f"爬取失败 {url}: {e}")

    # Hand everything to storage; presumably store_news de-duplicates, as the
    # original comment says - confirm against its implementation.
    store_news(all_news)
    print(f"{datetime.datetime.now()} 已抓取{len(all_news)}条新闻")

# Scheduled job: register news_monitor to run once every hour
schedule.every(1).hours.do(news_monitor)

# Poll the scheduler forever: run whatever is due, then sleep 60 seconds.
# This loop never exits, so nothing placed after it will run.
while True:
    schedule.run_pending()
    time.sleep(60)

20.8 知识图谱

20.9 学习总结

核心要点：

掌握HTTP请求与响应处理
熟练使用主流解析工具
理解动态页面加载原理
能够应对常见反爬措施

实践建议：

遵守爬虫道德规范
添加随机请求延迟
实现异常处理机制
定期维护代理池

进阶方向：

分布式爬虫架构
验证码智能识别
数据清洗与分析
反爬JS逆向工程

常见陷阱：

触发网站防护机制
页面结构变更导致解析失败
未处理编码问题
法律风险意识不足

持续更新Python编程学习日志与技巧，敬请关注！

#编程# #学习# #python# #在头条记录我的2025#

python timestamp

上一篇：「ELK」elastalert 日志告警（elk日志平台）
下一篇：Python学不会来打我(25)函数参数传递详解:值传递?引用传递?

30天学会Python编程:20. Python网络爬虫简介

20.1 网络爬虫基础

20.1.1 爬虫定义与原理

20.1.2 法律与道德规范

20.2 请求库使用

20.2.1 requests库

20.2.2 高级请求技巧

20.3 数据解析技术

20.3.1 BeautifulSoup解析

20.3.2 XPath与lxml

20.4 动态页面处理

20.4.1 Selenium自动化

20.4.2 接口逆向分析

20.5 数据存储方案

20.5.1 文件存储

20.5.2 数据库存储

20.6 反爬应对策略

20.6.1 常见反爬机制

20.6.2 高级反反爬技巧

20.7 应用举例

案例1：电商商品爬虫

案例2：新闻聚合爬虫

20.8 知识图谱

20.9 学习总结

相关推荐

C# 13 和 .NET 9 全知道 :13 使用 ASP.NET Core 构建网站 (1)

程序员的开源月刊《HelloGitHub》第 71 期

SparkSQL——DataFrame的创建与使用

如何将AI助手接入微信（打开ai手机助手）

使用过 Redis 分布式锁么，它是什么回事?

VUE循环语句的使用(v-for)（vuefor循环的key）

HiveOs系统教程最细手把手教学（hiveos启动）

HIVE SQL基础语法（hive-sql）

Spring Boot 概述（spring boot干嘛的）

《循环(for/while)》（循环while语句）