Add support for * wildcard keyword matching and real publish-date parsing

This commit is contained in:
2025-09-26 15:55:48 +08:00
parent f15b730dca
commit 181a78ad44
5 changed files with 1004 additions and 100 deletions

View File

@@ -148,15 +148,29 @@ class WebsiteCrawler:
return links
def check_keyword_match(self, text, title):
    """Return the configured keywords found in ``text`` or ``title``.

    Matching is case-insensitive. A ``*`` inside a keyword acts as a
    wildcard that matches any run of characters (possibly empty, not
    spanning a newline); every other character is matched literally.

    Args:
        text: Body text to search. ``None`` is treated as empty.
        title: Title to search. ``None`` is treated as empty.

    Returns:
        list: The keywords from ``self.keywords`` that matched, in their
        configured order, each appended once per configured entry.
    """
    import re

    text = text or ''
    title = title or ''
    matched_keywords = []

    for keyword in self.keywords:
        # Escape every regex metacharacter, then turn the escaped "*"
        # back into ".*" so it is the only character with special meaning.
        regex_pattern = re.escape(keyword).replace(r'\*', '.*')
        try:
            pattern = re.compile(regex_pattern, re.IGNORECASE)
        except re.error:
            # Defensive fallback: if the pattern cannot be compiled,
            # degrade to a plain case-insensitive substring test.
            keyword_lower = keyword.lower()
            is_match = (
                keyword_lower in text.lower()
                or keyword_lower in title.lower()
            )
        else:
            # IGNORECASE already handles case, so search the raw strings.
            is_match = bool(pattern.search(text) or pattern.search(title))

        # Record the keyword exactly once, however many times it matched.
        if is_match:
            matched_keywords.append(keyword)

    return matched_keywords
@@ -200,23 +214,131 @@ class WebsiteCrawler:
'.publish-date',
'time[datetime]',
'.article-time',
'.news-time'
'.news-time',
'.post-time',
'.create-time',
'.update-time',
'.time span',
'.date span',
'.info span', # 一些网站使用.info类包含发布信息
'.meta span',
'.meta-info',
'.article-info span',
'.news-info span',
'.content-info span',
'.a-shijian', # 上海纪检监察网站的发布时间类
'.l-time' # 天津纪检监察网站的发布时间类
]
for selector in date_selectors:
element = soup.select_one(selector)
if element:
elements = soup.select(selector)
for element in elements:
date_text = element.get_text().strip()
if element.get('datetime'):
date_text = element.get('datetime')
# 如果文本太短或为空,跳过
if not date_text or len(date_text) < 4:
continue
# 尝试解析日期
try:
from datetime import datetime
# 这里可以添加更复杂的日期解析逻辑
# 暂时返回当前时间
return timezone.now()
except:
import re
# 清理日期文本,移除常见的无关字符
date_text = re.sub(r'发布(时间|日期)[:]?', '', date_text).strip()
date_text = re.sub(r'时间[:]?', '', date_text).strip()
date_text = re.sub(r'日期[:]?', '', date_text).strip()
date_text = re.sub(r'发表于[:]?', '', date_text).strip()
date_text = re.sub(r'更新[:]?', '', date_text).strip()
date_text = re.sub(r'\s+', ' ', date_text).strip() # 替换多个空白字符为单个空格
# 如果有 datetime 属性且是标准格式,直接使用
if element.get('datetime'):
datetime_attr = element.get('datetime')
# 尝试解析常见的日期时间格式
for fmt in [
'%Y-%m-%d %H:%M:%S',
'%Y-%m-%dT%H:%M:%S',
'%Y-%m-%dT%H:%M:%S%z',
'%Y-%m-%d %H:%M',
'%Y-%m-%d',
'%Y/%m/%d %H:%M:%S',
'%Y/%m/%d %H:%M',
'%Y/%m/%d',
'%Y年%m月%d%H:%M:%S',
'%Y年%m月%d%H:%M',
'%Y年%m月%d',
'%m/%d/%Y %H:%M:%S',
'%m/%d/%Y %H:%M',
'%m/%d/%Y',
'%d/%m/%Y %H:%M:%S',
'%d/%m/%Y %H:%M',
'%d/%m/%Y',
'%d.%m.%Y %H:%M:%S',
'%d.%m.%Y %H:%M',
'%d.%m.%Y'
]:
try:
if '%z' in fmt and '+' not in datetime_attr and datetime_attr.endswith('Z'):
datetime_attr = datetime_attr[:-1] + '+0000'
parsed_date = datetime.strptime(datetime_attr, fmt)
if not timezone.is_aware(parsed_date):
parsed_date = timezone.make_aware(parsed_date)
return parsed_date
except ValueError:
continue
# 尝试解析从文本中提取的日期
# 尝试解析各种常见的中文日期格式
for fmt in [
'%Y年%m月%d%H:%M:%S',
'%Y年%m月%d%H:%M',
'%Y年%m月%d',
'%Y-%m-%d %H:%M:%S',
'%Y-%m-%d %H:%M',
'%Y-%m-%d',
'%Y/%m/%d %H:%M:%S',
'%Y/%m/%d %H:%M',
'%Y/%m/%d',
'%m月%d%H:%M',
'%m月%d',
'%m/%d/%Y %H:%M:%S',
'%m/%d/%Y %H:%M',
'%m/%d/%Y',
'%d/%m/%Y %H:%M:%S',
'%d/%m/%Y %H:%M',
'%d/%m/%Y',
'%d.%m.%Y %H:%M:%S',
'%d.%m.%Y %H:%M',
'%d.%m.%Y'
]:
try:
parsed_date = datetime.strptime(date_text, fmt)
# 如果没有年份,使用当前年份
if '%Y' not in fmt:
parsed_date = parsed_date.replace(year=datetime.now().year)
if not timezone.is_aware(parsed_date):
parsed_date = timezone.make_aware(parsed_date)
return parsed_date
except ValueError:
continue
# 如果以上格式都不匹配,尝试使用 dateutil 解析
try:
from dateutil import parser
# 过滤掉明显不是日期的文本
if len(date_text) > 5 and not date_text.isdigit():
parsed_date = parser.parse(date_text)
if not timezone.is_aware(parsed_date):
parsed_date = timezone.make_aware(parsed_date)
return parsed_date
except:
pass
except Exception as e:
self.log('debug', f'解析日期失败: {date_text}, 错误: {str(e)}')
continue
return None