""" 关键词爬虫引擎 基于 crawler_engine.py 的关键词爬取方法改进 """ import requests import time import re import logging import os import urllib3 from bs4 import BeautifulSoup from urllib.parse import urljoin, urlparse from django.conf import settings from django.utils import timezone from django.core.files.base import ContentFile from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry from .models import Website, CrawlTask, Article from .utils import get_page_with_selenium, get_page_with_requests, check_keyword_in_content # 禁用SSL警告 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) # 设置日志记录器 logger = logging.getLogger(__name__) class KeywordCrawler: """关键词爬虫引擎""" def __init__(self, task_id, task_executor_instance=None): self.task = CrawlTask.objects.get(id=task_id) self.task_id = task_id self.task_executor = task_executor_instance self.keywords = [kw.strip() for kw in self.task.keyword.split(',') if kw.strip()] if self.task.keyword else [] # 创建带重试策略的会话 self.session = requests.Session() self.session.headers.update({ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' }) # 设置重试策略 retry_strategy = Retry( total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504], ) adapter = HTTPAdapter(max_retries=retry_strategy) self.session.mount("http://", adapter) self.session.mount("https://", adapter) # 设置超时 self.timeout = 15 def log(self, level, message, website=None): """记录日志""" print(f"[{level.upper()}] {message}") logger.log(getattr(logging, level.upper()), f"Task {self.task.id}: {message}") def is_cancelled(self): """检查任务是否已被取消""" if self.task_executor: return self.task_executor.is_task_cancelled(self.task_id) return False def update_task_status(self, status, **kwargs): """更新任务状态""" self.task.status = status if status == 'running' and not self.task.started_at: self.task.started_at = timezone.now() elif status in ['completed', 'failed', 'cancelled']: self.task.completed_at = timezone.now() for key, value in kwargs.items(): setattr(self.task, key, value) self.task.save() def extract_text_content(self, soup): """提取文本内容,保持段落结构""" # 移除脚本和样式标签 for script in soup(["script", "style"]): script.decompose() # 处理段落标签,保持段落结构 paragraphs = [] # 查找所有段落相关的标签 for element in soup.find_all(['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'br']): if element.name in ['p', 'div']: text = element.get_text().strip() if text: paragraphs.append(text) elif element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: text = element.get_text().strip() if text: paragraphs.append(f"\n{text}\n") # 标题前后加换行 elif element.name == 'br': paragraphs.append('\n') # 如果没有找到段落标签,使用原来的方法 if not paragraphs: text = soup.get_text() # 清理文本但保持换行 lines = [] for line in text.splitlines(): line = line.strip() if line: lines.append(line) return '\n\n'.join(lines) # 合并段落,用双换行分隔 content = '\n\n'.join(paragraphs) # 清理多余的空行 content = re.sub(r'\n\s*\n\s*\n', '\n\n', content) return content.strip() def clean_url(self, url): """清理和修复URL""" try: # 处理空值或None if not url or url is None: return "" # 修复常见的URL问题 # 将错误的编码字符恢复 url = str(url).replace('%C3%97', '×') # 修复 × 字符的错误编码 url = url.replace('%E2%80%93', '–') # 修复 – 字符的错误编码 url = url.replace('%E2%80%94', '—') # 修复 — 字符的错误编码 # 解析URL并重新构建 parsed = urlparse(url) # 清理查询参数 if parsed.query: # 处理查询参数中的编码问题 from urllib.parse import parse_qs, urlencode, unquote query_params = parse_qs(parsed.query) cleaned_params = {} for key, values in query_params.items(): # 解码参数名 clean_key = unquote(key) # 解码参数值 clean_values = 
    def clean_url(self, url):
        """Clean and repair a URL."""
        try:
            # Handle empty or None values
            if not url:
                return ""

            # Fix common URL problems: restore characters that were percent-encoded incorrectly
            url = str(url).replace('%C3%97', '×')   # mis-encoded × character
            url = url.replace('%E2%80%93', '–')     # mis-encoded – character
            url = url.replace('%E2%80%94', '—')     # mis-encoded — character

            # Parse the URL and rebuild it
            parsed = urlparse(url)

            # Clean the query parameters
            if parsed.query:
                # Handle encoding problems in the query string
                from urllib.parse import parse_qs, urlencode, unquote
                query_params = parse_qs(parsed.query)
                cleaned_params = {}
                for key, values in query_params.items():
                    # Decode the parameter name and values
                    clean_key = unquote(key)
                    clean_values = [unquote(val) for val in values]
                    cleaned_params[clean_key] = clean_values
                # Rebuild the query string
                query_string = urlencode(cleaned_params, doseq=True)
            else:
                query_string = ''

            # Rebuild the URL
            clean_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
            if query_string:
                clean_url += f"?{query_string}"
            if parsed.fragment:
                clean_url += f"#{parsed.fragment}"
            return clean_url
        except Exception as e:
            self.log('warning', f'URL cleanup failed: {url}, error: {e}')
            return url

    def is_valid_article_url(self, url):
        """Check whether a URL looks like a valid article URL."""
        try:
            # Exclude URLs that are clearly not articles
            exclude_patterns = [
                'javascript:', 'mailto:', '#', 'tel:',
                '.pdf', '.doc', '.docx', '.xls', '.xlsx',
                '.jpg', '.jpeg', '.png', '.gif', '.svg',
                '.mp3', '.mp4', '.avi', '.mov'
            ]
            url_lower = url.lower()
            for pattern in exclude_patterns:
                if pattern in url_lower:
                    return False

            # Reject URLs that are too short
            if len(url) < 10:
                return False

            # Accept URLs containing article-related keywords
            article_keywords = ['article', 'news', 'content', 'detail', 'view', 'show', 'post']
            for keyword in article_keywords:
                if keyword in url_lower:
                    return True

            # URLs that look like article IDs or deep paths are also accepted
            if any(char.isdigit() for char in url) and len(url.split('/')) > 3:
                return True

            return False
        except Exception:
            return False

    def find_article_links(self, soup, base_url):
        """Find article links on a page."""
        links = []
        seen_urls = set()  # avoid duplicate URLs

        # Common selectors for article links
        selectors = [
            'a[href*="article"]', 'a[href*="news"]', 'a[href*="content"]',
            'a[href*="detail"]', 'a[href*="view"]', 'a[href*="show"]',
            '.news-list a', '.article-list a', '.content-list a',
            'h3 a', 'h4 a', '.title a', '.list-item a'
        ]

        for selector in selectors:
            elements = soup.select(selector)
            for element in elements:
                href = element.get('href')
                if href:
                    # Clean and repair the URL
                    clean_href = self.clean_url(href)
                    full_url = urljoin(base_url, clean_href)
                    # Clean the absolute URL again
                    full_url = self.clean_url(full_url)

                    # Keep the link only if the URL is valid and not seen before
                    if (full_url not in seen_urls
                            and self.is_valid_article_url(full_url)
                            and full_url.startswith(('http://', 'https://'))):
                        title = element.get_text().strip()
                        if title and len(title) > 5:  # skip very short titles
                            links.append({
                                'url': full_url,
                                'title': title
                            })
                            seen_urls.add(full_url)
        return links

    def check_keyword_match(self, text, title):
        """Check keyword matches (improved version)."""
        matched_keywords = []
        for keyword in self.keywords:
            # Use the improved keyword check helper
            if check_keyword_in_content(text, keyword) or check_keyword_in_content(title, keyword):
                matched_keywords.append(keyword)
        return matched_keywords

    def extract_article_content(self, url, soup):
        """Extract the article content."""
        # Try several content selectors
        content_selectors = [
            '.article-content', '.content', '.article-body', '.news-content',
            '.main-content', '.post-content', 'article', '.detail-content',
            '#content', '.text',
            '.box_con',       # used by sites such as Xinhuanet
            '.content_area',  # used by sites such as CCTV.com
        ]

        content = ""
        for selector in content_selectors:
            element = soup.select_one(selector)
            if element:
                content = self.extract_text_content(element)
                if len(content) > 100:  # make sure the content is long enough
                    break

        # If no specific content area was found, use the whole page
        if not content or len(content) < 100:
            content = self.extract_text_content(soup)

        return content
    def extract_publish_date(self, soup):
        """Extract the publish date."""
        date_selectors = [
            '.publish-time', '.pub-time', '.date', '.time', '.publish-date',
            'time[datetime]', '.article-time', '.news-time', '.post-time',
            '.create-time', '.update-time', '.time span', '.date span',
            '.info span', '.meta span', '.meta-info', '.article-info span',
            '.news-info span', '.content-info span', '.a-shijian', '.l-time'
        ]

        for selector in date_selectors:
            elements = soup.select(selector)
            for element in elements:
                date_text = element.get_text().strip()
                if element.get('datetime'):
                    date_text = element.get('datetime')

                # Skip empty or very short text
                if not date_text or len(date_text) < 4:
                    continue

                # Try to parse the date
                try:
                    from datetime import datetime

                    # Strip common Chinese prefixes such as 发布时间 / 时间 / 日期 / 发表于 / 更新
                    date_text = re.sub(r'发布(时间|日期)[::]?', '', date_text).strip()
                    date_text = re.sub(r'时间[::]?', '', date_text).strip()
                    date_text = re.sub(r'日期[::]?', '', date_text).strip()
                    date_text = re.sub(r'发表于[::]?', '', date_text).strip()
                    date_text = re.sub(r'更新[::]?', '', date_text).strip()
                    date_text = re.sub(r'\s+', ' ', date_text).strip()

                    # If a datetime attribute is present, try to parse it directly
                    if element.get('datetime'):
                        datetime_attr = element.get('datetime')
                        # Try common datetime formats
                        for fmt in [
                            '%Y-%m-%d %H:%M:%S',
                            '%Y-%m-%dT%H:%M:%S',
                            '%Y-%m-%dT%H:%M:%S%z',
                            '%Y-%m-%d %H:%M',
                            '%Y-%m-%d',
                            '%Y/%m/%d %H:%M:%S',
                            '%Y/%m/%d %H:%M',
                            '%Y/%m/%d',
                            '%Y年%m月%d日 %H:%M:%S',
                            '%Y年%m月%d日 %H:%M',
                            '%Y年%m月%d日',
                        ]:
                            try:
                                if '%z' in fmt and '+' not in datetime_attr and datetime_attr.endswith('Z'):
                                    datetime_attr = datetime_attr[:-1] + '+0000'
                                parsed_date = datetime.strptime(datetime_attr, fmt)
                                if not timezone.is_aware(parsed_date):
                                    parsed_date = timezone.make_aware(parsed_date)
                                return parsed_date
                            except ValueError:
                                continue

                    # Try to parse the date taken from the element text
                    for fmt in [
                        '%Y年%m月%d日 %H:%M:%S',
                        '%Y年%m月%d日 %H:%M',
                        '%Y年%m月%d日',
                        '%Y-%m-%d %H:%M:%S',
                        '%Y-%m-%d %H:%M',
                        '%Y-%m-%d',
                        '%Y/%m/%d %H:%M:%S',
                        '%Y/%m/%d %H:%M',
                        '%Y/%m/%d',
                        '%m月%d日 %H:%M',
                        '%m月%d日',
                    ]:
                        try:
                            parsed_date = datetime.strptime(date_text, fmt)
                            # If the format has no year, use the current year
                            if '%Y' not in fmt:
                                parsed_date = parsed_date.replace(year=datetime.now().year)
                            if not timezone.is_aware(parsed_date):
                                parsed_date = timezone.make_aware(parsed_date)
                            return parsed_date
                        except ValueError:
                            continue

                    # If none of the formats matched, fall back to dateutil
                    try:
                        from dateutil import parser
                        if len(date_text) > 5 and not date_text.isdigit():
                            parsed_date = parser.parse(date_text)
                            if not timezone.is_aware(parsed_date):
                                parsed_date = timezone.make_aware(parsed_date)
                            return parsed_date
                    except Exception:
                        pass
                except Exception as e:
                    self.log('debug', f'Failed to parse date: {date_text}, error: {str(e)}')
                    continue
        return None

    def extract_author(self, soup):
        """Extract the author information."""
        author_selectors = [
            '.author', '.writer', '.publisher', '.byline',
            '.article-author', '.news-author', '.source'
        ]
        for selector in author_selectors:
            element = soup.select_one(selector)
            if element:
                return element.get_text().strip()
        return ""

    def download_media_file(self, media_url, article, media_type='image', alt_text=''):
        """Download a media file, adapted to the existing model structure."""
        try:
            # Validate the URL
            if not media_url or not media_url.startswith(('http://', 'https://')):
                return None

            # Request the media file
            response = self.session.get(
                media_url,
                timeout=self.timeout,
                verify=False,
                stream=False
            )
            response.raise_for_status()

            # Collect file information
            content_type = response.headers.get('content-type', '')
            file_size = len(response.content)

            # Determine the file extension
            file_extension = self.get_file_extension_from_url(media_url, content_type)

            # Build the file name
            existing_media_count = len(article.media_files) if article.media_files else 0
            filename = f"media_{article.id}_{existing_media_count}{file_extension}"

            # Build the media file info dictionary
            media_info = {
                'type': media_type,
                'original_url': media_url,
                'filename': filename,
                'file_size': file_size,
                'mime_type': content_type,
                'alt_text': alt_text,
                'downloaded_at': timezone.now().isoformat()
            }

            # Update the article's media file list
            if not article.media_files:
                article.media_files = [media_info]
            else:
                article.media_files.append(media_info)

            # Saving the file to local storage is simplified here; a real
            # deployment may need a more elaborate storage backend.
            self.log('info', f'Media file recorded: {filename} ({media_type})')
            return media_info
        except Exception as e:
            self.log('error', f'Failed to download media file {media_url}: {str(e)}')
            return None
    def get_file_extension_from_url(self, url, content_type):
        """Get the file extension from the URL or the content type."""
        # Try the URL path first
        parsed_url = urlparse(url)
        path = parsed_url.path
        if '.' in path:
            return os.path.splitext(path)[1]

        # Fall back to the content type
        content_type_map = {
            'image/jpeg': '.jpg',
            'image/jpg': '.jpg',
            'image/png': '.png',
            'image/gif': '.gif',
            'image/webp': '.webp',
            'image/svg+xml': '.svg',
            'video/mp4': '.mp4',
            'video/avi': '.avi',
            'video/mov': '.mov',
            'video/wmv': '.wmv',
            'video/flv': '.flv',
            'video/webm': '.webm',
            'audio/mp3': '.mp3',
            'audio/wav': '.wav',
            'audio/ogg': '.ogg',
            'application/pdf': '.pdf',
            'application/msword': '.doc',
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
        }
        return content_type_map.get(content_type.lower(), '.bin')

    def extract_and_download_media(self, soup, article, base_url):
        """Extract and download the media files on a page."""
        media_files = []

        # Extract images
        images = soup.find_all('img')
        self.log('info', f'Found {len(images)} image tags')
        for img in images:
            src = img.get('src')
            if src:
                # Resolve relative URLs
                if src.startswith('//'):
                    src = 'https:' + src
                elif src.startswith('/'):
                    src = urljoin(base_url, src)
                elif not src.startswith(('http://', 'https://')):
                    src = urljoin(base_url, src)

                alt_text = img.get('alt', '')
                media_file = self.download_media_file(src, article, 'image', alt_text)
                if media_file:
                    media_files.append(media_file)

        # Extract videos
        videos = soup.find_all(['video', 'source'])
        for video in videos:
            src = video.get('src')
            if src:
                # Resolve relative URLs
                if src.startswith('//'):
                    src = 'https:' + src
                elif src.startswith('/'):
                    src = urljoin(base_url, src)
                elif not src.startswith(('http://', 'https://')):
                    src = urljoin(base_url, src)

                media_file = self.download_media_file(src, article, 'video')
                if media_file:
                    media_files.append(media_file)

        return media_files
    def crawl_website(self, website):
        """Crawl a single website."""
        self.log('info', f'Starting to crawl website: {website.name}')

        try:
            # Request the home page
            response = self.session.get(
                website.base_url,
                timeout=self.timeout,
                verify=False
            )
            response.raise_for_status()

            # Check the content encoding
            if response.encoding != 'utf-8':
                content_type = response.headers.get('content-type', '')
                if 'charset=' in content_type:
                    charset = content_type.split('charset=')[-1]
                    response.encoding = charset
                else:
                    response.encoding = 'utf-8'

            soup = BeautifulSoup(response.content, 'html.parser')

            # Find article links
            article_links = self.find_article_links(soup, website.base_url)
            self.log('info', f'Found {len(article_links)} article links')

            crawled_count = 0
            for link_info in article_links:
                # Stop if the task has been cancelled
                if self.is_cancelled():
                    self.log('info', 'Task has been cancelled, stopping article processing')
                    return crawled_count

                try:
                    # Clean and validate the URL
                    clean_url = self.clean_url(link_info['url'])
                    if not self.is_valid_article_url(clean_url):
                        self.log('warning', f'Skipping invalid URL: {clean_url}')
                        continue

                    self.log('info', f'Processing article: {clean_url}')

                    # Request the article page
                    article_response = self.session.get(
                        clean_url,
                        timeout=self.timeout,
                        verify=False
                    )
                    article_response.raise_for_status()

                    # Check the content encoding
                    if article_response.encoding != 'utf-8':
                        content_type = article_response.headers.get('content-type', '')
                        if 'charset=' in content_type:
                            charset = content_type.split('charset=')[-1]
                            article_response.encoding = charset
                        else:
                            article_response.encoding = 'utf-8'

                    article_soup = BeautifulSoup(article_response.content, 'html.parser')

                    # Extract the content
                    content = self.extract_article_content(clean_url, article_soup)
                    title = link_info['title']

                    # Check keyword matches
                    matched_keywords = self.check_keyword_match(content, title)
                    if matched_keywords:
                        # Extract additional information
                        publish_date = self.extract_publish_date(article_soup)
                        author = self.extract_author(article_soup)

                        # Check whether an article with the same URL already exists
                        existing_article = Article.objects.filter(url=clean_url).first()

                        if existing_article:
                            # Update the existing record
                            existing_article.title = title
                            existing_article.content = content
                            existing_article.pub_date = publish_date
                            existing_article.media_files = []  # reset the media file list
                            existing_article.save()
                            # Re-extract media files and persist the updated media_files list
                            self.extract_and_download_media(article_soup, existing_article, clean_url)
                            existing_article.save()
                            self.log('info', f'Updated existing article: {title[:50]}...')
                        else:
                            # Save the new article
                            article = Article.objects.create(
                                website=website,
                                title=title,
                                content=content,
                                url=clean_url,
                                pub_date=publish_date,
                                media_files=[]
                            )
                            # Extract media files and persist the updated media_files list
                            self.extract_and_download_media(article_soup, article, clean_url)
                            article.save()
                            self.log('info', f'Saved new article: {title[:50]}...')

                        crawled_count += 1

                    # Pause between requests
                    time.sleep(1)

                except requests.exceptions.HTTPError as e:
                    if e.response.status_code == 404:
                        self.log('warning', f'Article not found (404): {clean_url}')
                    elif e.response.status_code == 403:
                        self.log('warning', f'Access denied (403): {clean_url}')
                    elif e.response.status_code == 429:
                        self.log('warning', f'Rate limited (429): {clean_url}')
                        time.sleep(5)  # back off for longer
                    else:
                        self.log('error', f'HTTP error {e.response.status_code}: {clean_url}')
                    continue
                except requests.exceptions.Timeout:
                    self.log('warning', f'Request timed out: {clean_url}')
                    continue
                except requests.exceptions.ConnectionError:
                    self.log('warning', f'Connection error: {clean_url}')
                    continue
                except Exception as e:
                    self.log('error', f'Failed to process article {clean_url}: {str(e)}')
                    continue

            self.log('info', f'Finished crawling website, saved {crawled_count} articles')
            return crawled_count

        except Exception as e:
            self.log('error', f'Failed to crawl website: {str(e)}')
            return 0

    def run(self):
        """Run the crawl task."""
        self.log('info', f'Starting keyword crawl task: {self.task.name}')
        self.update_task_status('running')

        total_crawled = 0
        websites = self.task.websites.all()
        self.task.total_pages = websites.count()
        self.task.save()

        for website in websites:
            # Stop if the task has been cancelled
            if self.is_cancelled():
                self.log('info', 'Task has been cancelled, stopping the crawl')
                self.update_task_status('cancelled', error_message='Task was cancelled')
                return

            try:
                crawled_count = self.crawl_website(website)
                total_crawled += crawled_count
                self.task.crawled_pages += 1
                self.task.save()

                # Check for cancellation again after each website
                if self.is_cancelled():
                    self.log('info', 'Task has been cancelled, stopping the crawl')
                    self.update_task_status('cancelled', error_message='Task was cancelled')
                    return
            except Exception as e:
                self.log('error', f'Error while crawling website {website.name}: {str(e)}')
                continue

        # Update the final task status
        if total_crawled > 0:
            self.update_task_status('completed')
            self.log('info', f'Keyword crawl task completed, crawled {total_crawled} articles')
        else:
            self.update_task_status('failed', error_message='No matching content found')
            self.log('error', 'Keyword crawl task failed: no matching content found')


def run_keyword_crawl_task(task_id, task_executor_instance=None):
    """Run a keyword crawl task."""
    try:
        crawler = KeywordCrawler(task_id, task_executor_instance)
        crawler.run()
        return f"Keyword crawl task {task_id} completed"
    except Exception as e:
        # Log the exception
        logger.error(f"Exception while running keyword crawl task {task_id}: {str(e)}", exc_info=True)
        task = CrawlTask.objects.get(id=task_id)
        task.status = 'failed'
        task.error_message = str(e)
        task.completed_at = timezone.now()
        task.save()
        return f"Keyword crawl task {task_id} failed: {str(e)}"
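

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not an existing entry point of this module):
# how run_keyword_crawl_task might be invoked manually from a separate script
# or `python manage.py shell`. The settings path "config.settings", the import
# path "crawler.keyword_crawler", and the task ID 1 are assumptions and must
# be replaced with the real project values; Django has to be set up before
# this module is imported, because the model imports above run at import time.
#
#     import os
#     import django
#
#     os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings")  # assumed settings path
#     django.setup()
#
#     from crawler.keyword_crawler import run_keyword_crawl_task  # assumed module path
#     print(run_keyword_crawl_task(task_id=1))  # assumed existing CrawlTask primary key
# ---------------------------------------------------------------------------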