-
-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Expand file tree
/
Copy pathcheck_links.py
More file actions
192 lines (161 loc) · 6.6 KB
/
check_links.py
File metadata and controls
192 lines (161 loc) · 6.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
FlyPython 链接检查工具
用于定期检查README文件中所有外部链接的有效性
"""
import re
import requests
import time
import json
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
# Project root: this script lives one directory below the repository root.
ROOT_DIR = Path(__file__).resolve().parent.parent
# Directory where the JSON link-check report is written.
REPORTS_DIR = ROOT_DIR / 'reports'
class LinkChecker:
    """Check the availability of external links extracted from markdown files.

    Checked links are bucketed by outcome in ``self.results`` under the keys
    ``working`` / ``broken`` / ``redirect`` / ``timeout`` / ``unknown``.
    """

    def __init__(self):
        # A shared session with a browser-like User-Agent: some sites reject
        # requests that advertise the default python-requests agent.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        self.timeout = 10  # per-request timeout, seconds
        self.results = {
            'working': [],
            'broken': [],
            'redirect': [],
            'timeout': [],
            'unknown': []
        }

    def extract_links_from_file(self, filename):
        """Extract all external (http/https) links from a markdown file.

        Args:
            filename: path to the file (str or Path).

        Returns:
            A list of dicts with keys ``text``, ``url``, ``file``, ``type``;
            empty list if the file cannot be read.
        """
        filename = str(filename)
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                content = f.read()
        except Exception as e:
            # BUG FIX: the message had lost its placeholder ("(unknown)");
            # report which file could not be read.
            print(f"无法读取文件 {filename}: {e}")
            return []
        # Markdown-style links: [text](url)
        markdown_links = re.findall(r'\[([^\]]*)\]\(([^)]+)\)', content)
        # Bare URLs: stop at whitespace or a closing bracket/brace/paren.
        url_pattern = r'https?://[^\s\])\}]+'
        plain_links = re.findall(url_pattern, content)
        links = []
        for text, url in markdown_links:
            if url.startswith('http'):  # skip relative links and anchors
                links.append({
                    'text': text,
                    'url': url,
                    'file': filename,
                    'type': 'markdown'
                })
        for url in plain_links:
            # Skip URLs already captured from markdown links.
            if not any(link['url'] == url for link in links):
                links.append({
                    'text': url,
                    'url': url,
                    'file': filename,
                    'type': 'plain'
                })
        return links

    def check_link(self, link):
        """Check a single link and file it in the matching result bucket.

        Mutates and returns *link*, adding ``status``, ``status_code`` and,
        where relevant, ``final_url`` / ``error``.
        """
        url = link['url']
        try:
            response = self.session.head(url, timeout=self.timeout, allow_redirects=True)
            status_code = response.status_code
            if status_code == 200:
                # BUG FIX: with allow_redirects=True the final status is never
                # 3xx, so the old `300 <= status_code < 400` branch was dead
                # code. Redirects are detected via the response history.
                if response.history:
                    link['status'] = 'redirect'
                    link['status_code'] = response.history[0].status_code
                    link['final_url'] = response.url
                    self.results['redirect'].append(link)
                else:
                    link['status'] = 'working'
                    link['status_code'] = status_code
                    self.results['working'].append(link)
            else:
                # Some servers reject HEAD; retry with GET before declaring
                # the link broken.
                try:
                    response = self.session.get(url, timeout=self.timeout)
                    if response.status_code == 200:
                        link['status'] = 'working'
                        link['status_code'] = response.status_code
                        self.results['working'].append(link)
                    else:
                        link['status'] = 'broken'
                        link['status_code'] = response.status_code
                        self.results['broken'].append(link)
                except requests.exceptions.RequestException:
                    # BUG FIX: narrowed from a bare `except:`, which would also
                    # swallow KeyboardInterrupt/SystemExit.
                    link['status'] = 'broken'
                    link['status_code'] = status_code
                    self.results['broken'].append(link)
        except requests.exceptions.Timeout:
            # Must precede RequestException: Timeout is a subclass of it.
            link['status'] = 'timeout'
            link['error'] = 'Request timeout'
            self.results['timeout'].append(link)
        except requests.exceptions.RequestException as e:
            link['status'] = 'unknown'
            link['error'] = str(e)
            self.results['unknown'].append(link)
        return link

    def check_all_links(self, links, max_workers=10):
        """Check all *links* concurrently with a thread pool, printing progress."""
        print(f"开始检查 {len(links)} 个链接...")
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_link = {executor.submit(self.check_link, link): link for link in links}
            for i, future in enumerate(as_completed(future_to_link), 1):
                link = future_to_link[future]
                try:
                    result = future.result()
                    status = result.get('status', 'unknown')
                    print(f"[{i}/{len(links)}] {status.upper()}: {result['url']}")
                    time.sleep(0.1)  # soft rate limit: be polite to servers
                except Exception as e:
                    print(f"检查链接时出错 {link['url']}: {e}")

    def generate_report(self):
        """Print a summary of the results and dump the details as JSON."""
        total = sum(len(links) for links in self.results.values())
        print("\n" + "=" * 60)
        print("链接检查报告")
        print("=" * 60)
        print(f"总链接数: {total}")
        print(f"正常链接: {len(self.results['working'])}")
        print(f"重定向链接: {len(self.results['redirect'])}")
        print(f"失效链接: {len(self.results['broken'])}")
        print(f"超时链接: {len(self.results['timeout'])}")
        print(f"未知状态: {len(self.results['unknown'])}")
        # Persist full per-link details for later inspection.
        REPORTS_DIR.mkdir(exist_ok=True)
        with open(REPORTS_DIR / 'link_check_results.json', 'w', encoding='utf-8') as f:
            json.dump(self.results, f, ensure_ascii=False, indent=2)
        print("\n详细结果已保存到: reports/link_check_results.json")
def main():
    """Extract links from the README files, check them all, and report."""
    checker = LinkChecker()
    # Files to scan, resolved relative to the project root.
    files_to_check = [ROOT_DIR / 'index.md', ROOT_DIR / 'zh-cn.md']
    all_links = []
    for filename in files_to_check:
        # BUG FIX: the message had lost its placeholder ("(unknown)");
        # report which file is being scanned.
        print(f"从 {filename} 提取链接...")
        links = checker.extract_links_from_file(filename)
        all_links.extend(links)
        print(f"找到 {len(links)} 个链接")
    if not all_links:
        print("没有找到任何链接!")
        return
    # Deduplicate by URL, preserving first-seen order.
    unique_links = []
    seen_urls = set()
    for link in all_links:
        if link['url'] not in seen_urls:
            unique_links.append(link)
            seen_urls.add(link['url'])
    print(f"去重后共 {len(unique_links)} 个唯一链接")
    checker.check_all_links(unique_links)
    checker.generate_report()
# Script entry point: run the link check when executed directly.
if __name__ == '__main__':
    main()