# NOTE: repository web-UI residue (topic-selection hints and the
# "530 lines / 20 KiB" file banner) preceded this module; it is not source code.
"""
@header({
  searchable: 1,
  filterable: 1,
  quickSearch: 1,
  title: '高清电影天堂',
  author: '完全修复推送版',
  lang: 'hipy'
})
"""

import json
import re
import sys
from html import unescape
from urllib.parse import quote_plus, unquote

sys.path.append('..')
from base.spider import Spider


class Spider(Spider):
    """hipy spider for the 高清电影天堂 site (https://www.gaoqing888.com).

    Pages are scraped with regular expressions. Listing entries are returned
    as TVBox-style ``vod_*`` dictionaries; detail pages expose Quark net-disk
    shares and magnet links as play sources.

    NOTE(review): ``self.fetch`` is not defined here; it is presumably
    provided by the imported base ``Spider`` and returns an object with a
    ``.text`` attribute -- confirm against ``base.spider``.
    """

    # Typographic punctuation normalised to plain ASCII by ``_clean_text``.
    _PUNCT_TABLE = str.maketrans({
        '\u201c': '"',
        '\u201d': '"',
        '\u2018': "'",
        '\u2019': "'",
        '\u2026': '...',
    })

    def getName(self):
        """Return the display name of this source."""
        return "高清电影天堂"

    def init(self, extend=""):
        """Set the site base URL. ``extend`` is accepted but unused."""
        self.baseUrl = "https://www.gaoqing888.com"
        self.siteUrl = self.baseUrl

    def homeContent(self, filter):
        """Return the two fixed categories offered by this source."""
        return {
            'class': [
                {'type_name': '每日更新', 'type_id': 'home'},
                {'type_name': '选电影', 'type_id': 'movie'},
            ]
        }

    def homeVideoContent(self):
        """Return up to 15 videos scraped from the site's front page.

        Any failure is logged with ``print`` and yields an empty list.
        """
        result = []
        try:
            html = self.fetch(self.baseUrl, headers=self._get_header()).text
            if html:
                # Only the first 15 entries are shown on the home screen.
                result = self._build_vod_list(self._parse_video_items(html)[:15])
        except Exception as e:
            print(f"homeVideoContent error: {str(e)}")
        return {'list': result}

    def categoryContent(self, tid, pg, filter, extend):
        """Return one page of videos for category ``tid``.

        Args:
            tid: Category id; ``"home"`` maps to the site root, anything else
                to ``/<tid>``.
            pg: Page number (string or int). Values that cannot be parsed are
                treated as page 1.
            filter, extend: Accepted for interface compatibility; unused.

        Returns:
            Dict with ``list``, ``page``, ``pagecount``, ``limit``, ``total``.
            On any error the list is empty and ``pagecount`` stays 1.
        """
        result = {'list': [], 'page': pg, 'pagecount': 1, 'limit': 90, 'total': 999999}
        try:
            page_no = self._to_page(pg)
            base = f"{self.baseUrl}/" if tid == "home" else f"{self.baseUrl}/{tid}"
            if page_no > 1:
                url = f"{base}?page={page_no}"
            else:
                # The first page of "home" is the bare base URL (no slash).
                url = self.baseUrl if tid == "home" else base

            html = self.fetch(url, headers=self._get_header()).text
            if not html:
                return result

            result['list'] = self._build_vod_list(self._parse_video_items(html))
            result['pagecount'] = self._get_page_count(html, page_no)
        except Exception as e:
            print(f"categoryContent error: {str(e)}")
        return result

    def detailContent(self, ids):
        """Return the detail record for the first id in ``ids``.

        The record's ``vod_play_from`` and ``vod_play_url`` contain one
        ``$$$``-separated group per source so that each source name lines up
        with its own ``#``-separated play entries.

        Returns:
            ``{'list': [vod_info]}``, or ``{'list': []}`` when ``ids`` is
            empty, the page cannot be fetched, or no title can be found.
        """
        if not ids:
            return {'list': []}

        vod_id = str(ids[0]).strip()
        url = f'{self.baseUrl}/{vod_id}/detail'

        try:
            html = self.fetch(url, headers=self._get_header()).text
            if not html:
                return {'list': []}

            title = self._extract_title(html)
            if title is None:
                return {'list': []}

            # Year: first "(YYYY)" found anywhere in the page.
            year_match = re.search(r'\((\d{4})\)', html)
            year = year_match.group(1) if year_match else ''

            # Cover: prefer an <img> with a "cover" class, else the first
            # <img> that has src followed by alt.
            pic_match = re.search(r'<img[^>]*class="[^"]*cover[^"]*"[^>]*src="([^"]+)"', html, re.S) or \
                re.search(r'<img[^>]*src="([^"]+)"[^>]*alt="[^"]*"[^>]*>', html, re.S)
            pic = pic_match.group(1).strip() if pic_match else ''

            desc = self._extract_description(html)
            info = self._extract_video_info(html)
            play_lines = self._extract_play_resources(html)

            # Group play entries by source; labels are generated by
            # _extract_play_resources so a prefix test is sufficient.
            groups = []
            for source in ("夸克网盘", "磁力链接"):
                lines = [line for line in play_lines if line.startswith(source)]
                if lines:
                    groups.append((source, lines))
            if not groups:
                groups = [("其他资源", play_lines or ["暂无资源$暂无资源"])]

            vod_info = {
                "vod_id": vod_id,
                "vod_name": title,
                "vod_pic": pic,
                "type_name": info.get('type', ''),
                "vod_year": info.get('year', year),
                "vod_area": info.get('area', ''),
                "vod_remarks": info.get('remarks', ''),
                "vod_actor": info.get('actor', ''),
                "vod_director": info.get('director', ''),
                "vod_content": desc,
                "vod_play_from": "$$$".join(name for name, _ in groups),
                "vod_play_url": "$$$".join("#".join(lines) for _, lines in groups),
            }
            return {'list': [vod_info]}

        except Exception as e:
            print(f"detailContent error: {str(e)}")
            return {'list': []}

    def searchContent(self, key, quick, pg='1'):
        """Search the site for ``key`` and return one page of results.

        Args:
            key: Search keyword (URL-encoded with ``quote_plus``).
            quick: Accepted for interface compatibility; unused.
            pg: Page number; unparsable values are treated as page 1.

        Returns:
            Dict with ``list``, ``page``, ``pagecount``, ``limit``, ``total``.
        """
        page_no = self._to_page(pg)
        result = {'list': [], 'page': page_no, 'pagecount': 1, 'limit': 90, 'total': 999999}
        try:
            url = f'{self.baseUrl}/search?kw={quote_plus(key)}'
            if page_no > 1:
                url = f'{url}&page={page_no}'

            html = self.fetch(url, headers=self._get_header()).text
            if not html:
                return result

            for raw_id, raw_pic, raw_alt, raw_title in self._parse_search_items(html):
                vod_id = raw_id.strip()
                # Fall back to the image alt text when the title link is empty.
                vod_name = self._clean_text(raw_title.strip() or raw_alt.strip())
                if not vod_id or not vod_name:
                    continue

                # NOTE(review): lazy match over the whole page; if this entry
                # has no rating it may pick up the next entry's rating.
                rating_pattern = rf'/{vod_id}/detail.*?<span class="rate-num">([^<]+)</span>'
                rating_match = re.search(rating_pattern, html, re.S)
                vod_rating = rating_match.group(1).strip() if rating_match else ""

                remarks = ""
                if vod_rating and vod_rating not in ("0", "0.0"):
                    remarks = f"评分:{vod_rating}"

                result['list'].append({
                    "vod_id": vod_id,
                    "vod_name": vod_name,
                    "vod_pic": raw_pic.strip(),
                    "vod_remarks": remarks,
                })

            # Pagination links; "&" may appear HTML-escaped as "&amp;".
            page_pattern = r'<a[^>]*href="[^"]*\?kw=[^&"]*&(?:amp;)?page=(\d+)"[^>]*>'
            pages = [int(num) for num in re.findall(page_pattern, html)]
            result['pagecount'] = max([page_no] + pages)

        except Exception as e:
            print(f"searchContent error: {str(e)}")

        return result

    def playerContent(self, flag, id, vipFlags):
        """Map a play entry to a player instruction.

        Quark net-disk URLs are normalised to absolute form and wrapped in a
        ``push://`` URL; magnet links and other URLs are passed through.
        The placeholder id ``"暂无资源"`` yields an empty URL.
        """
        if id == "暂无资源":
            return {"parse": 0, "url": ""}

        if id.startswith('magnet:'):
            return {"parse": 0, "url": id}

        if 'pan.quark.cn' in id:
            return {"parse": 0, "url": f"push://{self._normalize_quark_url(id)}"}

        return {"parse": 0, "url": id, "header": self._get_header()}

    # ----------------------------------------------------------- helpers

    @staticmethod
    def _to_page(pg):
        """Coerce ``pg`` to a positive int, defaulting to 1."""
        try:
            page_no = int(pg)
        except (TypeError, ValueError):
            return 1
        return page_no if page_no > 0 else 1

    def _build_remarks(self, p_content):
        """Build the remarks string (rating / playable flag) for a list item."""
        if not p_content:
            return ""
        remarks = []
        rating_match = re.search(r'<strong[^>]*title="评分">([^<]+)</strong>', p_content, re.S)
        rating = rating_match.group(1).strip() if rating_match else ""
        if rating and rating not in ("0", "0.0"):
            remarks.append(f"评分:{rating}")
        if 'playable fa fa-play-circle-o' in p_content:
            remarks.append("可播")
        return " ".join(remarks)

    def _build_vod_list(self, items):
        """Turn ``(id, name, pic, detail_html)`` tuples into vod dictionaries.

        Entries with an empty id or name are skipped.
        """
        vods = []
        for item in items:
            vod_id = item[0].strip()
            vod_name = self._clean_text(item[1].strip())
            if not vod_id or not vod_name:
                continue
            vods.append({
                "vod_id": vod_id,
                "vod_name": vod_name,
                "vod_pic": item[2].strip() if len(item) > 2 else "",
                "vod_remarks": self._build_remarks(item[3] if len(item) > 3 else ""),
            })
        return vods

    def _parse_video_items(self, html):
        """Parse listing entries from a page.

        Returns:
            A list of ``(id, name, pic, detail_html)`` tuples. The first
            pattern that matches anything wins. The fallback patterns capture
            ``(id, src, alt)`` and are reordered here so that callers always
            see the name at index 1 and the picture at index 2.
        """
        primary = r'<a class="video-item"[^>]*target="_blank"[^>]*href="[^"]*/(\d+)/detail"[^>]*title="([^"]*)"[^>]*>.*?<div class="wp-cover">.*?<img[^>]*src="([^"]*)"[^>]*alt="[^"]*"[^>]*>.*?<p>(.*?)</p>'
        matches = re.findall(primary, html, re.S)
        if matches:
            return matches

        fallbacks = [
            r'<a class="video-item"[^>]*href="[^"]*/(\d+)/detail"[^>]*>.*?<img[^>]*src="([^"]*)"[^>]*alt="([^"]*)"',
            r'<a[^>]*href="/(\d+)/detail"[^>]*>.*?<img[^>]*class="cover"[^>]*src="([^"]*)"[^>]*alt="([^"]*)"',
        ]
        for pattern in fallbacks:
            matches = re.findall(pattern, html, re.S)
            if matches:
                return [(vid, alt, src, "") for vid, src, alt in matches]
        return []

    def _parse_search_items(self, html):
        """Parse search results into ``(id, pic, alt, title)`` tuples.

        Tries the result rows inside the ``wp-list`` container first, then
        the same row shape over the whole page, then a looser link+cover
        pattern (where the alt text doubles as the title).
        """
        rows = []
        section = re.search(r'<div class="wp-list[^"]*">(.*?)</div>\s*</div>', html, re.S)
        if section:
            row_pattern = r'<div class="video-row"[^>]*>.*?<a[^>]*href="[^"]*/(\d+)/detail"[^>]*class="cover-link">.*?<img[^>]*class="cover"[^>]*src="([^"]*)"[^>]*alt="([^"]*)"[^>]*>.*?<a[^>]*class="title-link"[^>]*href="[^"]*/(\d+)/detail"[^>]*>([^<]*)</a>'
            rows = re.findall(row_pattern, section.group(1), re.S)

        if not rows:
            item_pattern = r'<div class="video-row"[^>]*>.*?<a[^>]*href="/(\d+)/detail"[^>]*class="cover-link">.*?<img[^>]*class="cover"[^>]*src="([^"]*)"[^>]*alt="([^"]*)"[^>]*>.*?<a[^>]*class="title-link"[^>]*href="[^"]*/(\d+)/detail"[^>]*>([^<]*)</a>'
            rows = re.findall(item_pattern, html, re.S)

        if rows:
            return [(row[0], row[1], row[2], row[4]) for row in rows]

        loose_pattern = r'<a[^>]*href="/(\d+)/detail"[^>]*>.*?<img[^>]*class="cover"[^>]*src="([^"]*)"[^>]*alt="([^"]*)"'
        return [(vid, src, alt, alt) for vid, src, alt in re.findall(loose_pattern, html, re.S)]

    def _extract_title(self, html):
        """Return the cleaned page title, or ``None`` when none is found.

        Uses the ``page-title`` <h1> when present; otherwise the <title>
        element with the site suffixes removed.
        """
        match = re.search(r'<h1[^>]*class="page-title"[^>]*>(.*?)</h1>', html, re.S)
        if match:
            raw = match.group(1).strip()
        else:
            match = re.search(r'<title>(.*?)</title>', html, re.S)
            if not match:
                return None
            raw = re.sub(r'_.*|迅雷下载.*|高清下载.*|高清电影天堂.*', '', match.group(1).strip())
        # Drop any inline markup nested inside the heading.
        return self._clean_text(re.sub(r'<[^>]+>', '', raw))

    @staticmethod
    def _normalize_quark_url(url):
        """Return an absolute https URL for a Quark net-disk reference."""
        if url.startswith('http'):
            return url
        if url.startswith('//'):
            return f'https:{url}'
        if url.startswith('/'):
            return f'https://pan.quark.cn{url}'
        if url.startswith('pan.quark.cn'):
            # Host already present; only the scheme is missing.
            return f'https://{url}'
        return f'https://pan.quark.cn/{url}'

    def _extract_quark_url(self, url):
        """Resolve an href to a Quark share URL, or ``None`` if it is not one.

        ``/go/play`` redirect links carry the target in their ``url=`` query
        parameter, which is URL-decoded and returned.
        """
        if not url or not isinstance(url, str):
            return None

        url = url.strip()

        if '/go/play' in url:
            match = re.search(r'url=([^&]+)', url)
            if match:
                return unquote(match.group(1))

        if 'pan.quark.cn' in url:
            return self._normalize_quark_url(url)

        return None

    def _extract_description(self, html):
        """Return the first synopsis found by the known patterns, else ''."""
        desc_patterns = [
            r'剧情简介[^<]*</h5>\s*<p[^>]*>(.*?)</p>',
            r'<div[^>]*class="video-summary"[^>]*>.*?<div[^>]*class="meta"[^>]*>(.*?)</div>',
            r'<div[^>]*class="vod-content"[^>]*>(.*?)</div>',
            r'<p[^>]*class="[^"]*desc[^"]*"[^>]*>(.*?)</p>',
        ]
        for pattern in desc_patterns:
            desc_match = re.search(pattern, html, re.S)
            if desc_match:
                return self._clean_text(desc_match.group(1).strip())
        return ''

    def _extract_video_info(self, html):
        """Extract area, type, runtime remark and cast from ``meta`` blocks.

        The first ``<div class="meta">`` is split on ``' / '``: the first CJK
        run of part 0 becomes ``area``, of part 1 becomes ``type``, and any
        part containing ``分钟`` becomes ``remarks``. The second ``meta``
        block, if present, is stored verbatim (stripped) as ``actor``.
        """
        info = {}
        meta_blocks = re.findall(r'<div[^>]*class="meta"[^>]*>(.*?)</div>', html, re.S)
        if not meta_blocks:
            return info

        parts = [p.strip() for p in meta_blocks[0].split(' / ') if p.strip()]
        if parts:
            country_match = re.search(r'([\u4e00-\u9fa5]+)', parts[0])
            if country_match:
                info['area'] = country_match.group(1)

            if len(parts) > 1:
                type_match = re.search(r'([\u4e00-\u9fa5]+)', parts[1])
                if type_match:
                    info['type'] = type_match.group(1)

            for part in parts:
                if '分钟' in part:
                    info['remarks'] = part.strip()

        if len(meta_blocks) >= 2:
            info['actor'] = meta_blocks[1].strip()

        return info

    def _extract_play_resources(self, html):
        """Collect play entries (``label$url``) from a detail page.

        At most five distinct Quark shares and five distinct magnet links are
        returned, labelled ``夸克网盘N`` / ``磁力链接N``. Hrefs are
        HTML-unescaped so that ``&amp;`` does not leak into the URLs.
        """
        play_lines = []

        quark_urls = []
        for href in re.findall(r'<a[^>]*href="([^"]*pan\.quark\.cn[^"]*)"[^>]*>', html, re.S):
            play_url = self._extract_quark_url(unescape(href))
            if play_url and play_url not in quark_urls:
                quark_urls.append(play_url)
        for i, play_url in enumerate(quark_urls[:5], 1):
            play_lines.append(f"夸克网盘{i}${play_url}")

        magnet_patterns = [
            r'href="(magnet:\?[^"]+)"',
            r'<a[^>]*href="(magnet:[^"]+)"[^>]*>',
        ]
        magnets = []
        for pattern in magnet_patterns:
            for link in re.findall(pattern, html, re.S):
                link = unescape(link).strip()
                # Both patterns usually hit the same anchors; keep each once.
                if link.startswith('magnet:') and link not in magnets:
                    magnets.append(link)
        for i, link in enumerate(magnets[:5], 1):
            play_lines.append(f"磁力链接{i}${link}")

        return play_lines

    def _get_page_count(self, html, current_page):
        """Estimate the total page count from pagination markup.

        A "load more" button means at least one more page exists, so
        ``current_page + 1`` is returned. Otherwise the largest page number
        found in any pagination link is used, never less than the current page.
        """
        current_page = self._to_page(current_page)

        load_more_pattern = r'<a[^>]*class="[^"]*btn-load[^"]*"[^>]*data-url="[^"]*\?page=(\d+)"[^>]*>'
        if re.search(load_more_pattern, html, re.S):
            return current_page + 1

        page_patterns = [
            r'<a[^>]*href="[^"]*\?page=(\d+)"[^>]*>',
            r'class="page-numbers">(\d+)</a>',
            r'page=(\d+)',
        ]
        max_page = current_page
        for pattern in page_patterns:
            for page_num in re.findall(pattern, html):
                max_page = max(max_page, int(page_num))
        return max_page

    def _clean_text(self, text):
        """Decode HTML entities, normalise quotes and collapse whitespace.

        Falsy input is returned unchanged.
        """
        if not text:
            return text
        # unescape() covers &amp; &nbsp; &quot; &lt; &gt; &#39; and friends.
        text = unescape(text).translate(self._PUNCT_TABLE)
        # \s also matches the no-break space produced by &nbsp;.
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def _get_header(self):
        """Return the browser-like request headers used for every fetch."""
        return {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "Referer": self.baseUrl,
        }

    def localProxy(self, params):
        """No local proxy is provided by this source."""
        pass

    def isVideoFormat(self, url):
        """Always ``False``: this source does no video-format sniffing."""
        return False

    def manualVideoCheck(self):
        """Return an empty list (no manual video checks)."""
        return []

    def destroy(self):
        """Nothing to release."""
        pass