diff --git a/crawler/crawler_engine.py b/crawler/crawler_engine.py index 29433fd..5dc2be5 100644 --- a/crawler/crawler_engine.py +++ b/crawler/crawler_engine.py @@ -148,15 +148,29 @@ class WebsiteCrawler: return links def check_keyword_match(self, text, title): - """检查关键字匹配""" + """检查关键字匹配 - 支持通配符的模糊匹配""" matched_keywords = [] text_lower = text.lower() title_lower = title.lower() for keyword in self.keywords: - keyword_lower = keyword.lower() - if keyword_lower in text_lower or keyword_lower in title_lower: - matched_keywords.append(keyword) + # 处理通配符匹配 + import re + + # 转义特殊字符并转换 * 为 .* 正则表达式 + escaped_keyword = re.escape(keyword) + regex_pattern = escaped_keyword.replace(r'\*', '.*') + + # 编译正则表达式,支持不区分大小写的匹配 + try: + pattern = re.compile(regex_pattern, re.IGNORECASE) + if pattern.search(text_lower) or pattern.search(title_lower): + matched_keywords.append(keyword) + except re.error: + # 如果正则表达式编译失败,回退到简单匹配 + keyword_lower = keyword.lower() + if keyword_lower in text_lower or keyword_lower in title_lower: + matched_keywords.append(keyword) return matched_keywords @@ -200,23 +214,131 @@ class WebsiteCrawler: '.publish-date', 'time[datetime]', '.article-time', - '.news-time' + '.news-time', + '.post-time', + '.create-time', + '.update-time', + '.time span', + '.date span', + '.info span', # 一些网站使用.info类包含发布信息 + '.meta span', + '.meta-info', + '.article-info span', + '.news-info span', + '.content-info span', + '.a-shijian', # 上海纪检监察网站的发布时间类 + '.l-time' # 天津纪检监察网站的发布时间类 ] for selector in date_selectors: - element = soup.select_one(selector) - if element: + elements = soup.select(selector) + for element in elements: date_text = element.get_text().strip() if element.get('datetime'): date_text = element.get('datetime') - + + # 如果文本太短或为空,跳过 + if not date_text or len(date_text) < 4: + continue + # 尝试解析日期 try: from datetime import datetime - # 这里可以添加更复杂的日期解析逻辑 - # 暂时返回当前时间 - return timezone.now() - except: + import re + + # 清理日期文本,移除常见的无关字符 + date_text = re.sub(r'发布(时间|日期)[::]?', '', date_text).strip() + date_text = re.sub(r'时间[::]?', '', date_text).strip() + date_text = re.sub(r'日期[::]?', '', date_text).strip() + date_text = re.sub(r'发表于[::]?', '', date_text).strip() + date_text = re.sub(r'更新[::]?', '', date_text).strip() + date_text = re.sub(r'\s+', ' ', date_text).strip() # 替换多个空白字符为单个空格 + + # 如果有 datetime 属性且是标准格式,直接使用 + if element.get('datetime'): + datetime_attr = element.get('datetime') + # 尝试解析常见的日期时间格式 + for fmt in [ + '%Y-%m-%d %H:%M:%S', + '%Y-%m-%dT%H:%M:%S', + '%Y-%m-%dT%H:%M:%S%z', + '%Y-%m-%d %H:%M', + '%Y-%m-%d', + '%Y/%m/%d %H:%M:%S', + '%Y/%m/%d %H:%M', + '%Y/%m/%d', + '%Y年%m月%d日 %H:%M:%S', + '%Y年%m月%d日 %H:%M', + '%Y年%m月%d日', + '%m/%d/%Y %H:%M:%S', + '%m/%d/%Y %H:%M', + '%m/%d/%Y', + '%d/%m/%Y %H:%M:%S', + '%d/%m/%Y %H:%M', + '%d/%m/%Y', + '%d.%m.%Y %H:%M:%S', + '%d.%m.%Y %H:%M', + '%d.%m.%Y' + ]: + try: + if '%z' in fmt and '+' not in datetime_attr and datetime_attr.endswith('Z'): + datetime_attr = datetime_attr[:-1] + '+0000' + parsed_date = datetime.strptime(datetime_attr, fmt) + if not timezone.is_aware(parsed_date): + parsed_date = timezone.make_aware(parsed_date) + return parsed_date + except ValueError: + continue + + # 尝试解析从文本中提取的日期 + # 尝试解析各种常见的中文日期格式 + for fmt in [ + '%Y年%m月%d日 %H:%M:%S', + '%Y年%m月%d日 %H:%M', + '%Y年%m月%d日', + '%Y-%m-%d %H:%M:%S', + '%Y-%m-%d %H:%M', + '%Y-%m-%d', + '%Y/%m/%d %H:%M:%S', + '%Y/%m/%d %H:%M', + '%Y/%m/%d', + '%m月%d日 %H:%M', + '%m月%d日', + '%m/%d/%Y %H:%M:%S', + '%m/%d/%Y %H:%M', + '%m/%d/%Y', + '%d/%m/%Y %H:%M:%S', + '%d/%m/%Y %H:%M', + '%d/%m/%Y', + '%d.%m.%Y %H:%M:%S', + '%d.%m.%Y %H:%M', + '%d.%m.%Y' + ]: + try: + parsed_date = datetime.strptime(date_text, fmt) + # 如果没有年份,使用当前年份 + if '%Y' not in fmt: + parsed_date = parsed_date.replace(year=datetime.now().year) + if not timezone.is_aware(parsed_date): + parsed_date = timezone.make_aware(parsed_date) + return parsed_date + except ValueError: + continue + + # 如果以上格式都不匹配,尝试使用 dateutil 解析 + try: + from dateutil import parser + # 过滤掉明显不是日期的文本 + if len(date_text) > 5 and not date_text.isdigit(): + parsed_date = parser.parse(date_text) + if not timezone.is_aware(parsed_date): + parsed_date = timezone.make_aware(parsed_date) + return parsed_date + except: + pass + + except Exception as e: + self.log('debug', f'解析日期失败: {date_text}, 错误: {str(e)}') continue return None diff --git a/crawler/templates/crawler/dashboard.html b/crawler/templates/crawler/dashboard.html index c356159..abc9cdb 100644 --- a/crawler/templates/crawler/dashboard.html +++ b/crawler/templates/crawler/dashboard.html @@ -84,67 +84,58 @@
-
-
-
- - - - - -
+
+
+ +
-
- - {% if stats.page_obj.has_other_pages %} - - {% endif %} + +
+ +
-
+ +
+ + +
+ +
+ + +
+ +
+
+ + + 清除 + +
+
+
@@ -240,7 +231,7 @@