5 hónapja · 5fd106c9f5
--- a/.gitignore
+++ b/.gitignore
@@ -157,4 +157,4 @@ cython_debug/
 
				 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
			
 
				 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
			
 
				 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
			
 
				-#.idea/
			
 
				+.idea/
			
--- a/info-spider/README.md
+++ b/info-spider/README.md
@@ -0,0 +1,56 @@
 
				+# info-spider
			
 
				+
			
 
				+一个用于调用github官方api以获取社区仓库信息的脚本
			
 
				+
			
 
				+##  使用方式
			
 
				+
			
 
				+本脚本可以直接运行，也可以作为模块被导入
			
 
				+
			
 
				+### Requirements
			
 
				+
			
 
				+需要有python运行环境并安装依赖库
			
 
				+
			
 
				+```shell
			
 
				+pip3 install -r requirements.txt
			
 
				+```
			
 
				+
			
 
				+### **配置说明**
			
 
				+
			
 
				+* 若需更改文件输出及配置文件路径，可通过更改脚本中PATH变量的值，若为空则默认为脚本同一目录下
			
 
				+
			
 
				+```python
			
 
				+PATH = " 这里填写文件输出以及配置文件路径 "
			
 
				+```
			
 
				+
			
 
				+* 使用者需要在config.json中填写如下选项以更好的使用脚本
			
 
				+  * **user** : 社区用户名，默认为DragonOS-Community
			
 
				+  * **token** : 使用者的[github token](https://github.com/settings/tokens)，用以增加访问访问次数(若不使用token则有每小时60次的访问限制，[查看详情](https://docs.github.com/zh/rest/overview/rate-limits-for-the-rest-api))
			
 
				+  * **parallel_threads** : 最大并行线程数
			
 
				+  * **black_list** : 仓库获取黑名单，列表中填写仓库的名称用于忽略该仓库中的contributor信息
			
 
				+  * **white_list** : 黑名单中的白名单，列表中填写用户名，黑名单中的仓库会忽略除了白名单中的contributor
			
 
				+
			
 
				+### 直接运行
			
 
				+
			
 
				+使用命令行执行脚本生成.json文件以及.xls文件
			
 
				+
			
 
				+```shell
			
 
				+python main.py
			
 
				+```
			
 
				+
			
 
				+### 作为模块导入
			
 
				+
			
 
				+可以调用模块中的get_json()和get_dict()
			
 
				+
			
 
				+* **get_dict()** : 返回带有社区信息的python字典
			
 
				+* **get_json()** : 返回带有社区信息的json文本
			
 
				+
			
 
				+## 添加统计条目
			
 
				+
			
 
				+如果后期需要添加社区仓库的统计条目，需要做以下改动
			
 
				+
			
 
				+1. 编写统计函数，参数为仓库信息字典，返回值字典{"条目名称":条目数据}，并在脚本头部的**function_list**中填写函数名
			
 
				+2. 将上述条目名称在脚本头部的head1中，作为最终输出在excel中的表头
			
 
				+
			
 
				+## TODO
			
 
				+
			
 
				+如果后期需要可以考虑进一步封装脚本
			
--- a/info-spider/config.json
+++ b/info-spider/config.json
@@ -0,0 +1,7 @@
 
				+{
			
 
				+  "user": "DragonOS-Community",
			
 
				+  "token": "",
			
 
				+  "parallel_threads":8,
			
 
				+  "black_list": ["grub","tar","gcc","acpi-rs","mini-backtrace","binutils","mpc","mpfr","gmp-6.2.1","flex","relibc"],
			
 
				+  "": ["fslongjin"]
			
 
				+}
			
--- a/info-spider/main.py
+++ b/info-spider/main.py
@@ -0,0 +1,223 @@
 
				+# -*- coding: UTF-8 -*-
			
 
				+import time
			
 
				+import requests
			
 
				+import json
			
 
				+from retry import retry
			
 
				+import xlwt
			
 
				+from os import path
			
 
				+from sys import stdout
			
 
				+from concurrent.futures import ThreadPoolExecutor
			
 
				+
			
 
				+__all__ = ["get_dict", "get_json"]
			
 
				+
			
 
				+function_list = ["get_cnt", "get_pr", "get_contributors"]  # 信息获取函数
			
 
				+PATH = ""  # 文件输出路径以及配置文件存储路径，为空则默认在脚本文件同一目录下`
			
 
				+head1 = ["name", "starred", "watching", "fork", "issue", "pull_request", "contributor"]  # 表头
			
 
				+head2 = ["name", "contributions"]
			
 
				+
			
 
				+# 配置文件读取
			
 
				+try:
			
 
				+    with open(path.join(PATH, "config.json"), "r", encoding="utf-8") as f:
			
 
				+        # 配置文件选项说明
			
 
				+        dic = json.loads(f.read())
			
 
				+        USER = dic["user"]  # 目标用户
			
 
				+        TOKEN = dic["token"]  # github访问令牌，用于增加api访问次数
			
 
				+        PARALLEL = dic["parallel_threads"]  # 最并行线程数
			
 
				+        BLACKLIST = dic["black_list"]  # contributor获取的仓库黑名单
			
 
				+        WHITELIST = dic["white_list"]  # 仓库黑名单中的contributor白名单
			
 
				+
			
 
				+    pool = ThreadPoolExecutor(max_workers=PARALLEL)
			
 
				+except Exception as e:
			
 
				+    print("There are some errors while getting configure information!\n")
			
 
				+    raise e
			
 
				+
			
 
				+
			
 
				+@retry(Exception, 5, 2, 8)
			
 
				+def get_info(url):
			
 
				+    """
			
 
				+    :param url:请求的api链接
			
 
				+    :return: py字典
			
 
				+    """
			
 
				+    headers = {"Authorization": "Bearer " + TOKEN}
			
 
				+    response = requests.get(url=url, headers=headers).text
			
 
				+    return json.loads(response)
			
 
				+
			
 
				+
			
 
				+def get_repo(repo_dict):
			
 
				+    """
			
 
				+    :param repo_dict:仓库字典
			
 
				+    :return: py字典
			
 
				+    """
			
 
				+    result = {"name": str(repo_dict.get("name")), "description": repo_dict.get("description")}
			
 
				+    for fuc in function_list:
			
 
				+        result.update(eval("%s(repo_dict)" % (fuc)))
			
 
				+    return result
			
 
				+
			
 
				+
			
 
				+def get_cnt(repo_dict):
			
 
				+    result = {
			
 
				+        "starred": repo_dict.get("stargazers_count"),
			
 
				+        "watching": repo_dict.get("watchers_count"),
			
 
				+        "fork": repo_dict.get("forks_count"),
			
 
				+        "issue": repo_dict.get("open_issues_count"),
			
 
				+    }
			
 
				+    return result
			
 
				+
			
 
				+
			
 
				+def get_pr(repo_dict):
			
 
				+    pr_dict = get_info(r"https://api.github.com/repos/" + repo_dict["full_name"] + "/pulls")
			
 
				+    return {"pull_request": len(pr_dict)}
			
 
				+
			
 
				+
			
 
				+def get_contributors(repo_dict):
			
 
				+    result = {"contributor_list": []}
			
 
				+    contri_dict = get_info(repo_dict["contributors_url"])
			
 
				+    for dic in contri_dict:
			
 
				+        # 黑白名单实现
			
 
				+        if repo_dict["name"] in BLACKLIST or repo_dict.get("parent"):
			
 
				+            if dic["login"] not in WHITELIST:
			
 
				+                continue
			
 
				+        tmp = {
			
 
				+            "name": dic["login"],
			
 
				+            "id": dic["id"],
			
 
				+            "contributions": dic["contributions"]
			
 
				+        }
			
 
				+        result["contributor_list"].append(tmp)
			
 
				+    result["contributor"] = len(result["contributor_list"])
			
 
				+    return result
			
 
				+
			
 
				+
			
 
				+def sum_up(dic):
			
 
				+    contribute_existed = {}
			
 
				+    result = {"total": {
			
 
				+        "starred": 0,
			
 
				+        "watching": 0,
			
 
				+        "fork": 0,
			
 
				+        "issue": 0,
			
 
				+        "pull_request": 0,
			
 
				+        "contributor": 0,
			
 
				+        "contributor_list": []
			
 
				+    }}
			
 
				+    pos = 0
			
 
				+    for repo in dic["repositories"]:
			
 
				+        for k in result["total"].keys():
			
 
				+            if k != "contributor_list":
			
 
				+                result["total"][k] += repo[k]
			
 
				+            else:
			
 
				+                # contributor累加
			
 
				+                for contribute in repo[k]:
			
 
				+                    if contribute_existed.get(contribute["name"]) is None:
			
 
				+                        result["total"][k].append(contribute.copy())
			
 
				+                        contribute_existed[contribute["name"]] = pos
			
 
				+                        pos += 1
			
 
				+                    else:
			
 
				+                        result["total"][k][contribute_existed[contribute["name"]]]["contributions"] += \
			
 
				+                            contribute["contributions"]
			
 
				+    result["total"]["contributor_list"].sort(key=lambda a: a["contributions"], reverse=True)
			
 
				+    result["total"]["contributor"] = len(contribute_existed)
			
 
				+    dic.update(result)
			
 
				+    return dic
			
 
				+
			
 
				+
			
 
				+def get_dict():
			
 
				+    """
			
 
				+    :return:带有信息的py字典
			
 
				+    """
			
 
				+    # 获取用户信息
			
 
				+    info_dict = {"repositories": []}
			
 
				+    root_dict = get_info(r"https://api.github.com/users/" + USER + r"/repos")
			
 
				+
			
 
				+    # 解析信息
			
 
				+    def thread(dic):
			
 
				+        result = get_repo(dic)
			
 
				+        info_dict["repositories"].append(result)
			
 
				+        return 1
			
 
				+
			
 
				+    # 分别获取每个仓库
			
 
				+    thread_list = []
			
 
				+    wrong_list = []
			
 
				+    for dic in root_dict:
			
 
				+        thread_list.append(pool.submit(thread, dic))
			
 
				+        time.sleep(0.05)
			
 
				+        # 等待线程完毕
			
 
				+    while thread_list:
			
 
				+        for x in thread_list:
			
 
				+            if x.done() and x.result():
			
 
				+                thread_list.remove(x)
			
 
				+            elif x.done() and not x.result():
			
 
				+                wrong_list.append(x.exception())
			
 
				+                thread_list.remove(x)
			
 
				+            stdout.write('\r %d threads left. . .' % (len(thread_list)))
			
 
				+
			
 
				+    # 输出线程完成情况
			
 
				+    stdout.write('\r Done!During the process,%d exceptions have been raised. . . ' % (len(wrong_list)))
			
 
				+    stdout.flush()
			
 
				+
			
 
				+    if len(wrong_list):
			
 
				+        for i in wrong_list:
			
 
				+            stdout.write(str(i) + "\n")
			
 
				+            stdout.flush()
			
 
				+
			
 
				+    # 按名字字母排序
			
 
				+    info_dict["repositories"].sort(key=lambda a: a["name"].lower())
			
 
				+    return sum_up(info_dict)
			
 
				+
			
 
				+
			
 
				+def get_json(dic=None):
			
 
				+    """
			
 
				+    :return:带有信息的json文本
			
 
				+    """
			
 
				+    if not dic:
			
 
				+        return json.dumps(get_dict(), sort_keys=False, indent=4, separators=(',', ':'), ensure_ascii=False)
			
 
				+    else:
			
 
				+        return json.dumps(dic, sort_keys=False, indent=4, separators=(',', ':'), ensure_ascii=False)
			
 
				+
			
 
				+
			
 
				+def wt_json(text):
			
 
				+    if PATH:
			
 
				+        with open(path.join(PATH, "github_info.json"), "w", encoding="utf-8") as f:
			
 
				+            f.write(text)
			
 
				+            f.flush()
			
 
				+    else:
			
 
				+        with open(path.join(PATH, "github_info.json"), "w", encoding="utf-8") as f:
			
 
				+            f.write(text)
			
 
				+            f.flush()
			
 
				+
			
 
				+
			
 
				+def wt_excel(dic):
			
 
				+    wb = xlwt.Workbook()
			
 
				+    # try:
			
 
				+    # 写入仓库数据
			
 
				+    tb1 = wb.add_sheet("repositories", cell_overwrite_ok=True)
			
 
				+    for i in range(len(head1)):
			
 
				+        tb1.write(0, i, head1[i])
			
 
				+    for i in range(len(dic["repositories"])):
			
 
				+        for j in range(len(head1)):
			
 
				+            tb1.write(i + 1, j, dic["repositories"][i][head1[j]])
			
 
				+    # 写入总计数据
			
 
				+    for i in range(len(head1)):
			
 
				+        if head1[i] == "name":
			
 
				+            tb1.write(len(dic["repositories"]) + 1, i, "Total")
			
 
				+            continue
			
 
				+        # if type(dic["total"][head1[i]]) == ("dict" or "list"):
			
 
				+        #     tb1.write(len(dic["repositories"]) + 2, i, len(dic["total"][head1[i]]))
			
 
				+        # else:
			
 
				+        tb1.write(len(dic["repositories"]) + 1, i, dic["total"][head1[i]])
			
 
				+    # 写入贡献者名单
			
 
				+    tb2 = wb.add_sheet("contributor list", cell_overwrite_ok=True)
			
 
				+    for i in range(len(head2)):
			
 
				+        tb2.write(0, i, head2[i])
			
 
				+    for i in range(len(dic["total"]["contributor_list"])):
			
 
				+        for j in range(len(head2)):
			
 
				+            tb2.write(i + 1, j, dic["total"]["contributor_list"][i][head2[j]])
			
 
				+
			
 
				+        # except Exception as e:
			
 
				+        #     print("\n")
			
 
				+        #     print(e)
			
 
				+        wb.save(path.join(PATH, "statistics.xls"))
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    dic = get_dict()
			
 
				+    wt_json(get_json(dic))
			
 
				+    wt_excel(dic)
			
--- a/info-spider/requirements.txt
+++ b/info-spider/requirements.txt
@@ -0,0 +1,3 @@
 
				+requests~=2.31.0
			
 
				+xlwt~=1.3.0
			
 
				+retry~=0.9.2