Add support *
This commit is contained in:
@@ -148,15 +148,29 @@ class WebsiteCrawler:
|
||||
return links
|
||||
|
||||
def check_keyword_match(self, text, title):
    """Return the configured keywords found in ``text`` or ``title``.

    Matching is case-insensitive. A ``*`` inside a keyword is a wildcard
    that stands for any run of characters (it does not span line breaks);
    every other character of the keyword is matched literally.

    Args:
        text: Body text to search. ``None`` is treated as an empty string.
        title: Title to search. ``None`` is treated as an empty string.

    Returns:
        A list of the keywords from ``self.keywords`` that matched, in
        their original spelling and order. Each keyword appears at most
        once per entry in ``self.keywords``.
    """
    # Imported once per call instead of once per keyword; kept local
    # because the module's top-level import block is not visible here.
    import re

    # Tolerate a missing body or title instead of raising on ``.lower()``.
    haystacks = ((text or '').lower(), (title or '').lower())
    matched_keywords = []

    for keyword in self.keywords:
        # A blank keyword would compile to an empty pattern and match
        # every page, so it is ignored.
        if not keyword or not keyword.strip():
            continue

        # Escape each literal fragment and join the fragments with ``.*``.
        # The result is always a valid pattern, so no compile-error
        # fallback is required.
        regex_pattern = '.*'.join(
            re.escape(fragment) for fragment in keyword.split('*')
        )
        pattern = re.compile(regex_pattern, re.IGNORECASE)

        if any(pattern.search(haystack) for haystack in haystacks):
            matched_keywords.append(keyword)

    return matched_keywords
|
||||
|
||||
@@ -200,23 +214,131 @@ class WebsiteCrawler:
|
||||
'.publish-date',
|
||||
'time[datetime]',
|
||||
'.article-time',
|
||||
'.news-time'
|
||||
'.news-time',
|
||||
'.post-time',
|
||||
'.create-time',
|
||||
'.update-time',
|
||||
'.time span',
|
||||
'.date span',
|
||||
'.info span', # 一些网站使用.info类包含发布信息
|
||||
'.meta span',
|
||||
'.meta-info',
|
||||
'.article-info span',
|
||||
'.news-info span',
|
||||
'.content-info span',
|
||||
'.a-shijian', # 上海纪检监察网站的发布时间类
|
||||
'.l-time' # 天津纪检监察网站的发布时间类
|
||||
]
|
||||
|
||||
for selector in date_selectors:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
elements = soup.select(selector)
|
||||
for element in elements:
|
||||
date_text = element.get_text().strip()
|
||||
if element.get('datetime'):
|
||||
date_text = element.get('datetime')
|
||||
|
||||
|
||||
# 如果文本太短或为空,跳过
|
||||
if not date_text or len(date_text) < 4:
|
||||
continue
|
||||
|
||||
# 尝试解析日期
|
||||
try:
|
||||
from datetime import datetime
|
||||
# 这里可以添加更复杂的日期解析逻辑
|
||||
# 暂时返回当前时间
|
||||
return timezone.now()
|
||||
except:
|
||||
import re
|
||||
|
||||
# 清理日期文本,移除常见的无关字符
|
||||
date_text = re.sub(r'发布(时间|日期)[::]?', '', date_text).strip()
|
||||
date_text = re.sub(r'时间[::]?', '', date_text).strip()
|
||||
date_text = re.sub(r'日期[::]?', '', date_text).strip()
|
||||
date_text = re.sub(r'发表于[::]?', '', date_text).strip()
|
||||
date_text = re.sub(r'更新[::]?', '', date_text).strip()
|
||||
date_text = re.sub(r'\s+', ' ', date_text).strip() # 替换多个空白字符为单个空格
|
||||
|
||||
# 如果有 datetime 属性且是标准格式,直接使用
|
||||
if element.get('datetime'):
|
||||
datetime_attr = element.get('datetime')
|
||||
# 尝试解析常见的日期时间格式
|
||||
for fmt in [
|
||||
'%Y-%m-%d %H:%M:%S',
|
||||
'%Y-%m-%dT%H:%M:%S',
|
||||
'%Y-%m-%dT%H:%M:%S%z',
|
||||
'%Y-%m-%d %H:%M',
|
||||
'%Y-%m-%d',
|
||||
'%Y/%m/%d %H:%M:%S',
|
||||
'%Y/%m/%d %H:%M',
|
||||
'%Y/%m/%d',
|
||||
'%Y年%m月%d日 %H:%M:%S',
|
||||
'%Y年%m月%d日 %H:%M',
|
||||
'%Y年%m月%d日',
|
||||
'%m/%d/%Y %H:%M:%S',
|
||||
'%m/%d/%Y %H:%M',
|
||||
'%m/%d/%Y',
|
||||
'%d/%m/%Y %H:%M:%S',
|
||||
'%d/%m/%Y %H:%M',
|
||||
'%d/%m/%Y',
|
||||
'%d.%m.%Y %H:%M:%S',
|
||||
'%d.%m.%Y %H:%M',
|
||||
'%d.%m.%Y'
|
||||
]:
|
||||
try:
|
||||
if '%z' in fmt and '+' not in datetime_attr and datetime_attr.endswith('Z'):
|
||||
datetime_attr = datetime_attr[:-1] + '+0000'
|
||||
parsed_date = datetime.strptime(datetime_attr, fmt)
|
||||
if not timezone.is_aware(parsed_date):
|
||||
parsed_date = timezone.make_aware(parsed_date)
|
||||
return parsed_date
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
# 尝试解析从文本中提取的日期
|
||||
# 尝试解析各种常见的中文日期格式
|
||||
for fmt in [
|
||||
'%Y年%m月%d日 %H:%M:%S',
|
||||
'%Y年%m月%d日 %H:%M',
|
||||
'%Y年%m月%d日',
|
||||
'%Y-%m-%d %H:%M:%S',
|
||||
'%Y-%m-%d %H:%M',
|
||||
'%Y-%m-%d',
|
||||
'%Y/%m/%d %H:%M:%S',
|
||||
'%Y/%m/%d %H:%M',
|
||||
'%Y/%m/%d',
|
||||
'%m月%d日 %H:%M',
|
||||
'%m月%d日',
|
||||
'%m/%d/%Y %H:%M:%S',
|
||||
'%m/%d/%Y %H:%M',
|
||||
'%m/%d/%Y',
|
||||
'%d/%m/%Y %H:%M:%S',
|
||||
'%d/%m/%Y %H:%M',
|
||||
'%d/%m/%Y',
|
||||
'%d.%m.%Y %H:%M:%S',
|
||||
'%d.%m.%Y %H:%M',
|
||||
'%d.%m.%Y'
|
||||
]:
|
||||
try:
|
||||
parsed_date = datetime.strptime(date_text, fmt)
|
||||
# 如果没有年份,使用当前年份
|
||||
if '%Y' not in fmt:
|
||||
parsed_date = parsed_date.replace(year=datetime.now().year)
|
||||
if not timezone.is_aware(parsed_date):
|
||||
parsed_date = timezone.make_aware(parsed_date)
|
||||
return parsed_date
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
# 如果以上格式都不匹配,尝试使用 dateutil 解析
|
||||
try:
|
||||
from dateutil import parser
|
||||
# 过滤掉明显不是日期的文本
|
||||
if len(date_text) > 5 and not date_text.isdigit():
|
||||
parsed_date = parser.parse(date_text)
|
||||
if not timezone.is_aware(parsed_date):
|
||||
parsed_date = timezone.make_aware(parsed_date)
|
||||
return parsed_date
|
||||
except:
|
||||
pass
|
||||
|
||||
except Exception as e:
|
||||
self.log('debug', f'解析日期失败: {date_text}, 错误: {str(e)}')
|
||||
continue
|
||||
|
||||
return None
|
||||
|
||||
Reference in New Issue
Block a user