
Introduction: Data Needs in the Era of Online Education

With the rapid growth of online education, course platforms have been springing up everywhere. For learners, education researchers, and content analysts, the course information on these platforms is highly valuable. This article shows how to use modern Python crawling techniques, asynchronous programming in particular, to collect online course information efficiently and reliably.

Technology Choices: A Modern Python Crawler Stack

  1. Asynchronous programming: asyncio + aiohttp for high-concurrency crawling

  2. Parsing: parsel (a standalone library built on Scrapy's Selector) and BeautifulSoup4

  3. Browser automation: playwright for JavaScript-rendered pages

  4. Data storage: pandas and sqlalchemy for structured storage

  5. Proxies and anti-bot handling: a proxy pool and randomized User-Agent strings

1. Environment Setup and Dependency Installation

bash

# Install required dependencies (asyncio ships with the standard library and does not
# need to be installed; backoff and matplotlib are used by later modules)
pip install aiohttp parsel beautifulsoup4 pandas sqlalchemy playwright backoff matplotlib

# Install browser binaries for Playwright
python -m playwright install

2. Basic Configuration Module

python

# config.py
import random
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class CrawlerConfig:
    """Crawler configuration."""
    # Request settings
    MAX_CONCURRENT_REQUESTS: int = 10
    REQUEST_TIMEOUT: int = 30
    RETRY_TIMES: int = 3

    # Crawl delay (helps avoid getting blocked)
    MIN_DELAY: float = 0.5
    MAX_DELAY: float = 2.0

    # Proxy settings
    USE_PROXY: bool = False
    PROXY_POOL: Optional[List[str]] = None

    # Output settings
    OUTPUT_FORMAT: str = "csv"  # csv, json, database
    OUTPUT_FILE: str = "courses_data.csv"

    # Database settings
    DATABASE_URL: str = "sqlite:///courses.db"


class UserAgentManager:
    """User-Agent pool manager."""

    DESKTOP_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15'
    ]

    MOBILE_AGENTS = [
        'Mozilla/5.0 (iPhone; CPU iPhone OS 17_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Mobile/15E148 Safari/604.1',
        'Mozilla/5.0 (Linux; Android 14; SM-S918B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.6099.210 Mobile Safari/537.36'
    ]

    @classmethod
    def get_random_ua(cls, device_type: str = "desktop") -> str:
        """Return a random User-Agent string."""
        if device_type == "mobile":
            return random.choice(cls.MOBILE_AGENTS)
        return random.choice(cls.DESKTOP_AGENTS)

3. Asynchronous Crawler Engine

python

# crawler_engine.py
import aiohttp
import asyncio
import logging
import random
from typing import Dict, Optional
from datetime import datetime

import backoff

from config import CrawlerConfig, UserAgentManager


class AsyncCrawlerEngine:
    """Asynchronous crawler engine."""

    def __init__(self, config: CrawlerConfig):
        self.config = config
        self.session = None
        self.semaphore = asyncio.Semaphore(config.MAX_CONCURRENT_REQUESTS)
        self.logger = self._setup_logger()

    def _setup_logger(self):
        """Configure the logging system."""
        logger = logging.getLogger(__name__)
        logger.setLevel(logging.INFO)

        # Console handler
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)

        # File handler
        file_handler = logging.FileHandler(
            f'crawler_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
        )
        file_handler.setLevel(logging.DEBUG)

        # Formatter
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        )
        console_handler.setFormatter(formatter)
        file_handler.setFormatter(formatter)

        logger.addHandler(console_handler)
        logger.addHandler(file_handler)
        return logger

    async def __aenter__(self):
        """Async context manager entry."""
        self.session = aiohttp.ClientSession(
            timeout=aiohttp.ClientTimeout(total=self.config.REQUEST_TIMEOUT),
            headers={'User-Agent': UserAgentManager.get_random_ua()}
        )
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit."""
        if self.session:
            await self.session.close()

    @backoff.on_exception(
        backoff.expo,
        (aiohttp.ClientError, asyncio.TimeoutError),
        max_tries=3
    )
    async def fetch_url(self, url: str, **kwargs) -> Optional[str]:
        """Fetch a URL asynchronously and return its body as text."""
        async with self.semaphore:
            try:
                headers = kwargs.pop('headers', {})
                headers['User-Agent'] = UserAgentManager.get_random_ua()

                proxy = None
                if self.config.USE_PROXY and self.config.PROXY_POOL:
                    proxy = random.choice(self.config.PROXY_POOL)

                async with self.session.get(
                    url, headers=headers, proxy=proxy, **kwargs
                ) as response:
                    response.raise_for_status()

                    # Random delay to avoid hitting the server too fast
                    await asyncio.sleep(
                        random.uniform(self.config.MIN_DELAY, self.config.MAX_DELAY)
                    )
                    return await response.text()

            except Exception as e:
                self.logger.error(f"Request failed {url}: {str(e)}")
                return None

    async def fetch_multiple_urls(self, urls: list) -> Dict[str, Optional[str]]:
        """Fetch multiple URLs concurrently."""
        tasks = [self.fetch_url(url) for url in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        return {
            url: result if not isinstance(result, Exception) else None
            for url, result in zip(urls, results)
        }
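
The engine can also be exercised on its own before the rest of the crawler is wired up. Below is a minimal usage sketch, assuming the config module above; the two URLs are placeholders for whatever pages you want to test against.

python

# engine_demo.py (illustrative usage sketch; the URLs are placeholders)
import asyncio

from config import CrawlerConfig
from crawler_engine import AsyncCrawlerEngine


async def demo():
    config = CrawlerConfig(MAX_CONCURRENT_REQUESTS=3)
    async with AsyncCrawlerEngine(config) as engine:
        results = await engine.fetch_multiple_urls([
            "https://www.example.com/",
            "https://www.example.org/",
        ])
        for url, html in results.items():
            # A None value means the request failed after retries
            print(url, "->", len(html) if html else "failed")


if __name__ == "__main__":
    asyncio.run(demo())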

4. Intelligent Parsing Module

python

# parser.py
import json
import re
from typing import Dict, Optional, Any

from parsel import Selector
from bs4 import BeautifulSoup

from config import UserAgentManager


class CourseParser:
    """Course information parser."""

    @staticmethod
    def parse_with_parsel(html: str, base_url: str = "") -> Dict[str, Any]:
        """Parse HTML with Parsel."""
        selector = Selector(text=html)

        # Try several selector strategies
        course_data = {
            'title': CourseParser._extract_title(selector),
            'instructor': CourseParser._extract_instructor(selector),
            'price': CourseParser._extract_price(selector),
            'rating': CourseParser._extract_rating(selector),
            'enrollment_count': CourseParser._extract_enrollment(selector),
            'duration': CourseParser._extract_duration(selector),
            'category': CourseParser._extract_category(selector),
            'description': CourseParser._extract_description(selector),
            'source_url': base_url,
            'platform': CourseParser._detect_platform(base_url)
        }
        return course_data

    @staticmethod
    def parse_json_ld(html: str) -> Optional[Dict[str, Any]]:
        """Parse JSON-LD structured data."""
        soup = BeautifulSoup(html, 'html.parser')
        json_ld_scripts = soup.find_all('script', type='application/ld+json')

        for script in json_ld_scripts:
            try:
                data = json.loads(script.string)
                if data.get('@type') in ['Course', 'Product', 'CreativeWork']:
                    return data
            except (json.JSONDecodeError, TypeError, AttributeError):
                continue
        return None

    @staticmethod
    def _extract_title(selector: Selector) -> str:
        """Extract the course title."""
        selectors = [
            '//h1[@class="course-title"]/text()',
            '//h1[contains(@class, "title")]/text()',
            '//meta[@property="og:title"]/@content',
            '//title/text()'
        ]
        for xpath in selectors:
            result = selector.xpath(xpath).get()
            if result and result.strip():
                return result.strip()
        return ""

    @staticmethod
    def _extract_price(selector: Selector) -> float:
        """Extract the course price."""
        price_selectors = [
            '//span[@class="price"]/text()',
            '//*[contains(@class, "price")]/text()',
            '//meta[@property="product:price:amount"]/@content'
        ]
        for xpath in price_selectors:
            price_text = selector.xpath(xpath).get()
            if price_text:
                # Pull out the numeric part
                numbers = re.findall(r'\d+\.?\d*', price_text)
                if numbers:
                    return float(numbers[0])
        return 0.0

    @staticmethod
    def _extract_rating(selector: Selector) -> float:
        """Extract the course rating."""
        rating_selectors = [
            '//meta[@property="og:rating"]/@content',
            '//span[@class="rating"]/text()',
            '//*[contains(@class, "rating")]/@aria-label'
        ]
        for xpath in rating_selectors:
            rating_text = selector.xpath(xpath).get()
            if rating_text:
                numbers = re.findall(r'\d+\.?\d*', rating_text)
                if numbers:
                    return float(numbers[0])
        return 0.0

    @staticmethod
    def _detect_platform(url: str) -> str:
        """Detect which platform a course URL belongs to."""
        platforms = {
            'coursera.org': 'Coursera',
            'udemy.com': 'Udemy',
            'edx.org': 'edX',
            'khanacademy.org': 'Khan Academy',
            'linkedin.com/learning': 'LinkedIn Learning',
            'skillshare.com': 'Skillshare',
            'pluralsight.com': 'Pluralsight'
        }
        for domain, name in platforms.items():
            if domain in url:
                return name
        return "Unknown"

    # The remaining extractors follow the same pattern and are left as stubs for brevity.
    @staticmethod
    def _extract_instructor(selector: Selector) -> str:
        # Extract instructor information
        return ""

    @staticmethod
    def _extract_enrollment(selector: Selector) -> int:
        # Extract enrollment count
        return 0

    @staticmethod
    def _extract_duration(selector: Selector) -> str:
        # Extract course duration
        return ""

    @staticmethod
    def _extract_category(selector: Selector) -> str:
        # Extract course category
        return ""

    @staticmethod
    def _extract_description(selector: Selector) -> str:
        # Extract course description
        return ""


class JavaScriptRenderer:
    """Renders JavaScript-heavy pages with Playwright."""

    def __init__(self):
        self.browser = None

    async def __aenter__(self):
        from playwright.async_api import async_playwright
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(
            headless=True,
            args=['--disable-blink-features=AutomationControlled']
        )
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.browser:
            await self.browser.close()
        if hasattr(self, 'playwright'):
            await self.playwright.stop()

    async def render_page(self, url: str) -> str:
        """Render a JavaScript page and return the resulting HTML."""
        context = await self.browser.new_context(
            user_agent=UserAgentManager.get_random_ua(),
            viewport={'width': 1920, 'height': 1080}
        )
        page = await context.new_page()

        # Basic anti-detection measure
        await page.add_init_script("""
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            });
        """)

        try:
            await page.goto(url, wait_until='networkidle')
            # Give the page a moment to finish loading
            await page.wait_for_timeout(2000)
            return await page.content()
        finally:
            await page.close()
            await context.close()
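
To sanity-check the selector strategy offline, the parser can be run against a hand-written HTML snippet. This is a minimal sketch; the markup and the expected output are made up for the example.

python

# parser_demo.py (illustrative; the sample HTML is fabricated for testing)
from parser import CourseParser

sample_html = """
<html>
  <head><title>Sample</title></head>
  <body>
    <h1 class="course-title">Intro to Python</h1>
    <span class="price">$49.99</span>
    <span class="rating">4.7</span>
  </body>
</html>
"""

data = CourseParser.parse_with_parsel(
    sample_html, "https://www.udemy.com/course/intro-to-python/"
)
print(data['title'], data['price'], data['rating'], data['platform'])
# Expected output: Intro to Python 49.99 4.7 Udemy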

5. Data Storage Module

python

# storage.py
import json
from datetime import datetime
from typing import Dict, Any

import pandas as pd
from sqlalchemy import create_engine, Table, Column, MetaData
from sqlalchemy import String, Float, Integer, Text, DateTime

from config import CrawlerConfig


class DataStorage:
    """Data storage manager."""

    def __init__(self, config: CrawlerConfig):
        self.config = config
        self.data_buffer = []

    def add_to_buffer(self, course_data: Dict[str, Any]):
        """Add a record to the buffer."""
        course_data['crawled_at'] = datetime.now()
        self.data_buffer.append(course_data)

        # Flush automatically once the buffer grows large enough
        if len(self.data_buffer) >= 100:
            self.flush_buffer()

    def flush_buffer(self):
        """Flush buffered records to the configured storage backend."""
        if not self.data_buffer:
            return

        if self.config.OUTPUT_FORMAT == 'csv':
            self._save_to_csv()
        elif self.config.OUTPUT_FORMAT == 'json':
            self._save_to_json()
        elif self.config.OUTPUT_FORMAT == 'database':
            self._save_to_database()

        saved_count = len(self.data_buffer)
        self.data_buffer.clear()
        print(f"Data saved: {saved_count} records")

    def _save_to_csv(self):
        """Save buffered records as CSV."""
        df = pd.DataFrame(self.data_buffer)

        # Append to the existing file if it is already there
        try:
            existing_df = pd.read_csv(self.config.OUTPUT_FILE)
            df = pd.concat([existing_df, df], ignore_index=True)
        except FileNotFoundError:
            pass

        df.to_csv(self.config.OUTPUT_FILE, index=False, encoding='utf-8-sig')

    def _save_to_json(self):
        """Save buffered records as JSON."""
        try:
            with open(self.config.OUTPUT_FILE, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
            existing_data.extend(self.data_buffer)
        except FileNotFoundError:
            existing_data = list(self.data_buffer)

        with open(self.config.OUTPUT_FILE, 'w', encoding='utf-8') as f:
            # default=str serializes the datetime in 'crawled_at'
            json.dump(existing_data, f, ensure_ascii=False, indent=2, default=str)

    def _save_to_database(self):
        """Save buffered records to the database."""
        engine = create_engine(self.config.DATABASE_URL)

        # Table definition
        metadata = MetaData()
        courses_table = Table(
            'courses', metadata,
            Column('id', Integer, primary_key=True, autoincrement=True),
            Column('title', String(500)),
            Column('instructor', String(200)),
            Column('price', Float),
            Column('rating', Float),
            Column('enrollment_count', Integer),
            Column('duration', String(50)),
            Column('category', String(100)),
            Column('description', Text),
            Column('platform', String(100)),
            Column('source_url', String(500)),
            Column('crawled_at', DateTime),
            Column('created_at', DateTime, default=datetime.now)
        )

        # Create the table if it does not exist yet
        metadata.create_all(engine)

        # Insert buffered rows
        df = pd.DataFrame(self.data_buffer)
        df.to_sql('courses', engine, if_exists='append', index=False)

    def close(self):
        """Close the storage, making sure everything is persisted."""
        if self.data_buffer:
            self.flush_buffer()

6. Main Crawler Class

python

# main_crawler.py
import asyncio
import re
from typing import List, Dict, Any
from urllib.parse import urljoin, urlparse

from parsel import Selector

from config import CrawlerConfig
from crawler_engine import AsyncCrawlerEngine
from parser import CourseParser, JavaScriptRenderer
from storage import DataStorage


class OnlineCourseCrawler:
    """Main online course crawler."""

    def __init__(self, config: CrawlerConfig):
        self.config = config
        self.crawler_engine = AsyncCrawlerEngine(config)
        self.storage = DataStorage(config)
        self.visited_urls = set()

        self.platform_patterns = {
            'coursera': r'https?://www\.coursera\.org/learn/[^/]+',
            'udemy': r'https?://www\.udemy\.com/course/[^/]+',
            'edx': r'https?://www\.edx\.org/learn/[^/]+',
            'skillshare': r'https?://www\.skillshare\.com/[^/]+'
        }

    async def crawl_course_page(self, url: str) -> Dict[str, Any]:
        """Crawl a single course detail page."""
        # Skip URLs we have already visited
        if url in self.visited_urls:
            return {}
        self.visited_urls.add(url)

        # Try a plain HTTP fetch first
        html = await self.crawler_engine.fetch_url(url)

        # An empty or very small response usually means the page is JavaScript-rendered
        if not html or len(html) < 1000:
            try:
                async with JavaScriptRenderer() as renderer:
                    html = await renderer.render_page(url)
            except Exception as e:
                print(f"JavaScript rendering failed {url}: {str(e)}")
                return {}

        if not html:
            return {}

        # Parse course information
        course_data = CourseParser.parse_with_parsel(html, url)

        # Try JSON-LD as an additional source
        json_ld_data = CourseParser.parse_json_ld(html)
        if json_ld_data:
            course_data.update(self._extract_from_json_ld(json_ld_data))

        # Persist the record
        if course_data.get('title'):
            self.storage.add_to_buffer(course_data)
            print(f"Successfully crawled: {course_data['title']}")

        return course_data

    async def crawl_course_list(self, list_url: str, max_pages: int = 10):
        """Crawl a course listing page."""
        for page in range(1, max_pages + 1):
            # Build the paginated URL (pagination differs per platform)
            paginated_url = self._build_pagination_url(list_url, page)

            html = await self.crawler_engine.fetch_url(paginated_url)
            if not html:
                continue

            # Extract links to course detail pages
            course_links = self._extract_course_links(html, list_url)

            # Crawl all detail pages concurrently
            tasks = [self.crawl_course_page(link) for link in course_links]
            await asyncio.gather(*tasks)

            # Stop when there is no next page
            if not self._has_next_page(html):
                break

    def _extract_course_links(self, html: str, base_url: str) -> List[str]:
        """Extract course links from a listing page."""
        selector = Selector(text=html)

        # Try several selectors
        link_selectors = [
            '//a[contains(@href, "course")]/@href',
            '//a[contains(@class, "course-link")]/@href',
            '//article//a/@href',
            '//div[@data-testid="course-card"]//a/@href'
        ]

        links = set()
        for xpath in link_selectors:
            found_links = selector.xpath(xpath).getall()
            for link in found_links:
                # Convert to an absolute URL
                absolute_url = urljoin(base_url, link)
                # Keep only course detail URLs
                if self._is_course_url(absolute_url):
                    links.add(absolute_url)

        return list(links)[:20]  # Limit how many pages are crawled per batch

    def _is_course_url(self, url: str) -> bool:
        """Return True if the URL looks like a course detail page."""
        for platform, pattern in self.platform_patterns.items():
            if re.match(pattern, url):
                return True
        return False

    def _build_pagination_url(self, base_url: str, page: int) -> str:
        """Build the paginated URL for a listing page."""
        parsed = urlparse(base_url)

        # Pagination parameters differ per platform
        if 'coursera' in parsed.netloc:
            return f"{base_url}?page={page}"
        elif 'udemy' in parsed.netloc:
            return f"{base_url}?p={page}"
        elif 'edx' in parsed.netloc:
            return f"{base_url}?page={page}"

        # Fallback
        return f"{base_url}?page={page}"

    def _has_next_page(self, html: str) -> bool:
        """Check whether a next page exists."""
        selector = Selector(text=html)
        next_buttons = selector.xpath(
            '//a[contains(text(), "Next") or contains(@class, "next")]'
        )
        return len(next_buttons) > 0

    def _extract_from_json_ld(self, json_ld: Dict) -> Dict[str, Any]:
        """Extract fields from JSON-LD data."""
        extracted = {}
        mapping = {
            'name': 'title',
            'description': 'description',
            'provider': 'instructor',
            'aggregateRating.ratingValue': 'rating',
            'offers.price': 'price',
            'offers.priceCurrency': 'currency'
        }

        for json_key, data_key in mapping.items():
            value = self._get_nested_value(json_ld, json_key)
            if value:
                extracted[data_key] = value

        return extracted

    def _get_nested_value(self, data: Dict, key_path: str):
        """Read a nested dictionary value via a dotted key path."""
        keys = key_path.split('.')
        value = data
        for key in keys:
            if isinstance(value, dict):
                value = value.get(key)
            else:
                return None
        return value

    async def run(self, start_urls: List[str]):
        """Run the crawler."""
        async with self.crawler_engine:
            # Crawl each start URL
            for url in start_urls:
                if '/course/' in url or '/learn/' in url:
                    # Course detail page
                    await self.crawl_course_page(url)
                else:
                    # Listing page
                    await self.crawl_course_list(url, max_pages=5)

        # Make sure all buffered data is persisted
        self.storage.close()
        print("Crawl finished!")

7. Usage Example and Main Program

python

# main.py
import asyncio

from main_crawler import OnlineCourseCrawler
from config import CrawlerConfig


async def main():
    # Configure the crawler
    config = CrawlerConfig(
        MAX_CONCURRENT_REQUESTS=5,
        REQUEST_TIMEOUT=30,
        OUTPUT_FORMAT="csv",
        OUTPUT_FILE="courses_data.csv",
        MIN_DELAY=1.0,
        MAX_DELAY=3.0
    )

    # Start URLs (examples)
    start_urls = [
        "https://www.coursera.org/browse",
        "https://www.udemy.com/courses/development/",
        "https://www.edx.org/learn/computer-science"
    ]

    # Create the crawler instance
    crawler = OnlineCourseCrawler(config)

    try:
        # Run the crawler
        await crawler.run(start_urls)

        # Data analysis and reporting
        await generate_report()

    except KeyboardInterrupt:
        print("\nCrawler interrupted by user")
    except Exception as e:
        print(f"Crawler error: {str(e)}")


async def generate_report():
    """Generate a crawl report."""
    import pandas as pd
    import matplotlib.pyplot as plt

    try:
        df = pd.read_csv("courses_data.csv")

        print("=" * 50)
        print("Crawl statistics report")
        print("=" * 50)

        # Basic statistics
        print(f"Total courses: {len(df)}")
        print("Platform distribution:")
        print(df['platform'].value_counts())

        # Price analysis
        print("\nPrice statistics:")
        print(f"Average price: ${df['price'].mean():.2f}")
        print(f"Highest price: ${df['price'].max():.2f}")
        print(f"Lowest price: ${df['price'].min():.2f}")

        # Rating analysis
        print("\nRating statistics:")
        print(f"Average rating: {df['rating'].mean():.2f}/5")

        # Save charts
        plt.figure(figsize=(12, 6))

        # Platform distribution pie chart
        plt.subplot(1, 2, 1)
        df['platform'].value_counts().plot.pie(autopct='%1.1f%%')
        plt.title('Course platform distribution')

        # Price distribution histogram
        plt.subplot(1, 2, 2)
        df['price'].hist(bins=20, edgecolor='black')
        plt.title('Course price distribution')
        plt.xlabel('Price ($)')
        plt.ylabel('Number of courses')

        plt.tight_layout()
        plt.savefig('course_analysis.png', dpi=300)
        plt.close()

        print("\nAnalysis report saved: course_analysis.png")

    except FileNotFoundError:
        print("Data file not found; run the crawler first")
    except Exception as e:
        print(f"Error while generating the report: {str(e)}")


if __name__ == "__main__":
    # Set the event loop policy (needed on Windows)
    try:
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    except AttributeError:
        pass

    # Run the main program
    asyncio.run(main())

8. Advanced Features: API Service and Distributed Scaling

python

# api_service.py
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List, Optional
import asyncio

app = FastAPI(title="Online Course Crawler API")


class CrawlRequest(BaseModel):
    urls: List[str]
    max_pages: int = 5
    output_format: str = "json"


class CrawlResponse(BaseModel):
    job_id: str
    status: str
    message: str


class CourseData(BaseModel):
    title: str
    platform: str
    price: float
    rating: float
    url: str


# API endpoints for controlling the crawler can be added here.
# The full implementation is omitted for brevity.

# Notes on distributed scaling:
# 1. Use Redis as the task queue
# 2. Use Celery or RQ for task distribution
# 3. Containerize the deployment with Docker
# 4. Manage the cluster with Kubernetes
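
As one possible way to flesh this out, the sketch below adds a POST endpoint that schedules a crawl with FastAPI's BackgroundTasks and reuses the OnlineCourseCrawler from section 6. The route paths, job-ID scheme, and in-memory job registry are illustrative assumptions rather than part of the original design; a production setup would move job state into Redis or a database as noted above.

python

# api_endpoints.py (illustrative sketch; route names and job registry are assumptions)
# Import this module alongside api_service before starting uvicorn so the routes register.
import uuid
from typing import Dict

from fastapi import BackgroundTasks

from api_service import app, CrawlRequest, CrawlResponse
from config import CrawlerConfig
from main_crawler import OnlineCourseCrawler

# Naive in-memory job registry; a real deployment would use Redis or a database
JOBS: Dict[str, str] = {}


async def run_crawl_job(job_id: str, request: CrawlRequest):
    """Run a crawl job and record its final status."""
    config = CrawlerConfig(
        OUTPUT_FORMAT=request.output_format,
        OUTPUT_FILE=f"courses_{job_id}.{request.output_format}",
    )
    crawler = OnlineCourseCrawler(config)
    try:
        await crawler.run(request.urls)
        JOBS[job_id] = "finished"
    except Exception:
        JOBS[job_id] = "failed"


@app.post("/crawl", response_model=CrawlResponse)
async def start_crawl(request: CrawlRequest, background_tasks: BackgroundTasks):
    """Accept a crawl request and run it in the background."""
    job_id = uuid.uuid4().hex
    JOBS[job_id] = "running"
    background_tasks.add_task(run_crawl_job, job_id, request)
    return CrawlResponse(job_id=job_id, status="accepted",
                         message="Crawl job scheduled")


@app.get("/crawl/{job_id}")
async def get_job_status(job_id: str):
    """Return the current status of a crawl job."""
    return {"job_id": job_id, "status": JOBS.get(job_id, "unknown")}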

Crawler Best Practices and Caveats

1. Respect robots.txt

python

# robots_checker.py
import urllib.robotparser
from urllib.parse import urlparse


def check_robots_permission(url: str, user_agent: str = "*") -> bool:
    """Check whether robots.txt allows crawling the given URL."""
    parsed = urlparse(url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"

    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(robots_url)
    rp.read()

    return rp.can_fetch(user_agent, url)
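
One simple way to apply this check, shown as a minimal sketch that is not wired into the modules above, is to filter the start URLs before handing them to the crawler:

python

# Illustrative usage (not part of the original modules): keep only URLs
# that robots.txt allows before starting a crawl.
from robots_checker import check_robots_permission

start_urls = [
    "https://www.coursera.org/browse",
    "https://www.edx.org/learn/computer-science",
]

allowed_urls = [u for u in start_urls if check_robots_permission(u)]
print(f"{len(allowed_urls)} of {len(start_urls)} URLs are allowed by robots.txt")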

2. Ethical and Legal Considerations

  • Only crawl publicly accessible data

  • Respect each site's terms of service

  • Do not place excessive load on the target servers

  • Comply with data protection regulations (such as GDPR)

3. Performance Optimization Tips

  • Reuse HTTP connections via connection pooling

  • Implement incremental crawling to avoid re-fetching pages

  • Use a Bloom filter to track which URLs have already been visited (see the sketch after this list)

  • Support resuming an interrupted crawl from a checkpoint
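
To illustrate the Bloom-filter idea, here is a small pure-Python filter that could stand in for the in-memory visited_urls set in OnlineCourseCrawler. The bit-array size and hash count are arbitrary example values, not tuned parameters from the original code.

python

# bloom_filter.py -- minimal Bloom filter sketch for URL deduplication
# (illustrative; bit-array size and hash count are arbitrary example values)
import hashlib


class BloomFilter:
    def __init__(self, size_bits: int = 1 << 20, num_hashes: int = 5):
        self.size = size_bits
        self.num_hashes = num_hashes
        self.bits = bytearray(size_bits // 8 + 1)

    def _positions(self, item: str):
        # Derive several bit positions from one MD5 digest
        digest = hashlib.md5(item.encode("utf-8")).digest()
        for i in range(self.num_hashes):
            chunk = int.from_bytes(digest[i * 3:i * 3 + 3], "big")
            yield chunk % self.size

    def add(self, item: str):
        for pos in self._positions(item):
            self.bits[pos // 8] |= 1 << (pos % 8)

    def __contains__(self, item: str) -> bool:
        return all(self.bits[pos // 8] & (1 << (pos % 8))
                   for pos in self._positions(item))


# Possible integration point: swap the visited_urls set for the filter
visited = BloomFilter()
url = "https://www.coursera.org/learn/machine-learning"
if url not in visited:
    visited.add(url)
print(url in visited)  # True (with a small false-positive probability)

Unlike a plain set, the filter's memory use stays fixed no matter how many URLs are added, at the cost of a tunable false-positive rate.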

Summary

This article walked through building a fully featured online-course crawler with modern Python techniques, covering the complete workflow from basic requests to JavaScript rendering, and from data parsing to storage and analysis. The key techniques include:

  1. Asynchronous, concurrent crawling: significantly improves crawl throughput

  2. Smart parsing strategies: combining several parsing methods raises the success rate

  3. Anti-anti-bot measures: sensible use of delays, proxies, and browser automation

  4. Structured storage: multiple output formats supported

  5. Complete monitoring: logging and error handling throughout
