Forráskód Böngészése

添加info_spider脚本 (#1)

* initialize

* initialize

* # This is a combination of 2 commits.
# This is the 1st commit message:

添加配置文件机制,黑白名单机制,优化线程容错率

# The commit message #2 will be skipped:

# 添加配置文件机制,黑白名单机制,优化线程容错率

* 添加配置文件机制,黑白名单机制,优化线程容错率

* 添加配置文件机制,黑白名单机制,优化线程容错率

* 添加并行线程数量配置

* 优化函数结构,完成excel写入模块

* 优化脚本,完成脚本文档

* 优化脚本,完成脚本文档
c-w-xiaohei 5 hónapja
szülő
commit
5fd106c9f5
5 módosított fájl, 290 hozzáadás és 1 törlés
  1. 1 1
      .gitignore
  2. 56 0
      info-spider/README.md
  3. 7 0
      info-spider/config.json
  4. 223 0
      info-spider/main.py
  5. 3 0
      info-spider/requirements.txt

+ 1 - 1
.gitignore

@@ -157,4 +157,4 @@ cython_debug/
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/

+ 56 - 0
info-spider/README.md

@@ -0,0 +1,56 @@
+# info-spider
+
+一个用于调用github官方api以获取社区仓库信息的脚本
+
+##  使用方式
+
+本脚本可以直接运行,也可以作为模块被导入
+
+### Requirements
+
+需要有python运行环境并安装依赖库
+
+```shell
+pip3 install -r requirements.txt
+```
+
+### **配置说明**
+
+* 若需更改文件输出及配置文件路径,可更改脚本中PATH变量的值;若为空,则使用运行脚本时的当前工作目录(在脚本所在目录下运行时即为脚本同一目录)
+
+```python
+PATH = " 这里填写文件输出以及配置文件路径 "
+```
+
+* 使用者需要在config.json中填写如下选项以更好的使用脚本
+  * **user** : 社区用户名,默认为DragonOS-Community
+  * **token** : 使用者的[github token](https://github.com/settings/tokens),用以增加API访问次数(若不使用token则有每小时60次的访问限制,[查看详情](https://docs.github.com/zh/rest/overview/rate-limits-for-the-rest-api))
+  * **parallel_threads** : 最大并行线程数
+  * **black_list** : 仓库获取黑名单,列表中填写仓库的名称用于忽略该仓库中的contributor信息
+  * **white_list** : 黑名单中的白名单,列表中填写用户名,黑名单中的仓库会忽略除了白名单中的contributor
+
+### 直接运行
+
+使用命令行执行脚本生成.json文件以及.xls文件
+
+```shell
+python main.py
+```
+
+### 作为模块导入
+
+可以调用模块中的get_json()和get_dict()
+
+* **get_dict()** : 返回带有社区信息的python字典
+* **get_json()** : 返回带有社区信息的json文本
+
+## 添加统计条目
+
+如果后期需要添加社区仓库的统计条目,需要做以下改动
+
+1. 编写统计函数,参数为仓库信息字典,返回值为字典{"条目名称":条目数据},并在脚本头部的**function_list**中填写函数名
+2. 将上述条目名称填写在脚本头部的**head1**中,作为最终输出在excel中的表头
+
+## TODO
+
+如果后期需要可以考虑进一步封装脚本

+ 7 - 0
info-spider/config.json

@@ -0,0 +1,7 @@
+{
+  "user": "DragonOS-Community",
+  "token": "",
+  "parallel_threads":8,
+  "black_list": ["grub","tar","gcc","acpi-rs","mini-backtrace","binutils","mpc","mpfr","gmp-6.2.1","flex","relibc"],
+  "white_list": ["fslongjin"]
+}

+ 223 - 0
info-spider/main.py

@@ -0,0 +1,223 @@
+# -*- coding: UTF-8 -*-
+import time
+import requests
+import json
+from retry import retry
+import xlwt
+from os import path
+from sys import stdout
+from concurrent.futures import ThreadPoolExecutor
+
+__all__ = ["get_dict", "get_json"]
+
+function_list = ["get_cnt", "get_pr", "get_contributors"]  # names of the statistics functions that get_repo calls for every repository
+PATH = ""  # directory of config.json and of the output files; when empty, path.join yields bare file names, i.e. the current working directory
+head1 = ["name", "starred", "watching", "fork", "issue", "pull_request", "contributor"]  # column headers of the "repositories" sheet
+head2 = ["name", "contributions"]  # column headers of the "contributor list" sheet
+
+# Read the configuration file (runs at import time)
+try:
+    with open(path.join(PATH, "config.json"), "r", encoding="utf-8") as f:
+        # Meaning of the configuration options
+        dic = json.loads(f.read())
+        USER = dic["user"]  # target user whose repositories are listed
+        TOKEN = dic["token"]  # GitHub access token, used to raise the API request allowance
+        PARALLEL = dic["parallel_threads"]  # maximum number of parallel worker threads
+        BLACKLIST = dic["black_list"]  # repositories whose contributors are ignored
+        WHITELIST = dic["white_list"]  # contributors that are still counted in blacklisted repositories
+
+    pool = ThreadPoolExecutor(max_workers=PARALLEL)
+except Exception as e:
+    print("There are some errors while getting configure information!\n")
+    raise e
+
+
+@retry(Exception, 5, 2, 8)
+def get_info(url):
+    """
+    :param url:请求的api链接
+    :return: py字典
+    """
+    headers = {"Authorization": "Bearer " + TOKEN}
+    response = requests.get(url=url, headers=headers).text
+    return json.loads(response)
+
+
+def get_repo(repo_dict):
+    """
+    :param repo_dict:仓库字典
+    :return: py字典
+    """
+    result = {"name": str(repo_dict.get("name")), "description": repo_dict.get("description")}
+    for fuc in function_list:
+        result.update(eval("%s(repo_dict)" % (fuc)))
+    return result
+
+
+def get_cnt(repo_dict):
+    result = {
+        "starred": repo_dict.get("stargazers_count"),
+        "watching": repo_dict.get("watchers_count"),
+        "fork": repo_dict.get("forks_count"),
+        "issue": repo_dict.get("open_issues_count"),
+    }
+    return result
+
+
+def get_pr(repo_dict):
+    pr_dict = get_info(r"https://api.github.com/repos/" + repo_dict["full_name"] + "/pulls")
+    return {"pull_request": len(pr_dict)}
+
+
+def get_contributors(repo_dict):
+    result = {"contributor_list": []}
+    contri_dict = get_info(repo_dict["contributors_url"])
+    for dic in contri_dict:
+        # 黑白名单实现
+        if repo_dict["name"] in BLACKLIST or repo_dict.get("parent"):
+            if dic["login"] not in WHITELIST:
+                continue
+        tmp = {
+            "name": dic["login"],
+            "id": dic["id"],
+            "contributions": dic["contributions"]
+        }
+        result["contributor_list"].append(tmp)
+    result["contributor"] = len(result["contributor_list"])
+    return result
+
+
+def sum_up(dic):
+    contribute_existed = {}
+    result = {"total": {
+        "starred": 0,
+        "watching": 0,
+        "fork": 0,
+        "issue": 0,
+        "pull_request": 0,
+        "contributor": 0,
+        "contributor_list": []
+    }}
+    pos = 0
+    for repo in dic["repositories"]:
+        for k in result["total"].keys():
+            if k != "contributor_list":
+                result["total"][k] += repo[k]
+            else:
+                # contributor累加
+                for contribute in repo[k]:
+                    if contribute_existed.get(contribute["name"]) is None:
+                        result["total"][k].append(contribute.copy())
+                        contribute_existed[contribute["name"]] = pos
+                        pos += 1
+                    else:
+                        result["total"][k][contribute_existed[contribute["name"]]]["contributions"] += \
+                            contribute["contributions"]
+    result["total"]["contributor_list"].sort(key=lambda a: a["contributions"], reverse=True)
+    result["total"]["contributor"] = len(contribute_existed)
+    dic.update(result)
+    return dic
+
+
+def get_dict():
+    """
+    :return:带有信息的py字典
+    """
+    # 获取用户信息
+    info_dict = {"repositories": []}
+    root_dict = get_info(r"https://api.github.com/users/" + USER + r"/repos")
+
+    # 解析信息
+    def thread(dic):
+        result = get_repo(dic)
+        info_dict["repositories"].append(result)
+        return 1
+
+    # 分别获取每个仓库
+    thread_list = []
+    wrong_list = []
+    for dic in root_dict:
+        thread_list.append(pool.submit(thread, dic))
+        time.sleep(0.05)
+        # 等待线程完毕
+    while thread_list:
+        for x in thread_list:
+            if x.done() and x.result():
+                thread_list.remove(x)
+            elif x.done() and not x.result():
+                wrong_list.append(x.exception())
+                thread_list.remove(x)
+            stdout.write('\r %d threads left. . .' % (len(thread_list)))
+
+    # 输出线程完成情况
+    stdout.write('\r Done!During the process,%d exceptions have been raised. . . ' % (len(wrong_list)))
+    stdout.flush()
+
+    if len(wrong_list):
+        for i in wrong_list:
+            stdout.write(str(i) + "\n")
+            stdout.flush()
+
+    # 按名字字母排序
+    info_dict["repositories"].sort(key=lambda a: a["name"].lower())
+    return sum_up(info_dict)
+
+
+def get_json(dic=None):
+    """
+    :return:带有信息的json文本
+    """
+    if not dic:
+        return json.dumps(get_dict(), sort_keys=False, indent=4, separators=(',', ':'), ensure_ascii=False)
+    else:
+        return json.dumps(dic, sort_keys=False, indent=4, separators=(',', ':'), ensure_ascii=False)
+
+
+def wt_json(text):
+    if PATH:
+        with open(path.join(PATH, "github_info.json"), "w", encoding="utf-8") as f:
+            f.write(text)
+            f.flush()
+    else:
+        with open(path.join(PATH, "github_info.json"), "w", encoding="utf-8") as f:
+            f.write(text)
+            f.flush()
+
+
+def wt_excel(dic):
+    wb = xlwt.Workbook()
+    # try:
+    # 写入仓库数据
+    tb1 = wb.add_sheet("repositories", cell_overwrite_ok=True)
+    for i in range(len(head1)):
+        tb1.write(0, i, head1[i])
+    for i in range(len(dic["repositories"])):
+        for j in range(len(head1)):
+            tb1.write(i + 1, j, dic["repositories"][i][head1[j]])
+    # 写入总计数据
+    for i in range(len(head1)):
+        if head1[i] == "name":
+            tb1.write(len(dic["repositories"]) + 1, i, "Total")
+            continue
+        # if type(dic["total"][head1[i]]) == ("dict" or "list"):
+        #     tb1.write(len(dic["repositories"]) + 2, i, len(dic["total"][head1[i]]))
+        # else:
+        tb1.write(len(dic["repositories"]) + 1, i, dic["total"][head1[i]])
+    # 写入贡献者名单
+    tb2 = wb.add_sheet("contributor list", cell_overwrite_ok=True)
+    for i in range(len(head2)):
+        tb2.write(0, i, head2[i])
+    for i in range(len(dic["total"]["contributor_list"])):
+        for j in range(len(head2)):
+            tb2.write(i + 1, j, dic["total"]["contributor_list"][i][head2[j]])
+
+        # except Exception as e:
+        #     print("\n")
+        #     print(e)
+        wb.save(path.join(PATH, "statistics.xls"))
+
+
+if __name__ == '__main__':
+    dic = get_dict()  # collect the statistics once and reuse them for both outputs
+    wt_json(get_json(dic))  # writes github_info.json
+    wt_excel(dic)  # writes statistics.xls

+ 3 - 0
info-spider/requirements.txt

@@ -0,0 +1,3 @@
+requests~=2.31.0
+xlwt~=1.3.0
+retry~=0.9.2