# dupPRdetect/detect_gitee.py
#
# 71 lines
# 1.8 KiB
# Python

import difflib
import json
import os
from collections import Counter

import jieba
import numpy as np
import requests
from fuzzywuzzy import fuzz
# SECURITY: an access token is committed in source control. Supply it through
# the GITEE_TOKEN environment variable instead; the literal below is kept only
# as a backward-compatible fallback and should be revoked and removed.
token = os.environ.get("GITEE_TOKEN") or "c8648d8381aa3e64b6778d6ee48602e9"
def crawl_api(url, timeout=10):
    """Fetch ``url`` with the module-level token and return the decoded JSON.

    Args:
        url: URL of the API endpoint to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        The decoded JSON payload, or ``None`` when the request fails, the
        response status is not 200, or the body is not valid JSON.
    """
    header = {"Authorization": "token %s" % token}
    try:
        # ``stream=True`` is intentionally not used: the whole body is parsed
        # anyway, and a streamed response that is neither read nor closed
        # (the non-200 path) keeps its connection checked out of the pool.
        response = requests.get(url, headers=header, timeout=timeout)
    except requests.RequestException:
        # Network failures are reported the same way as a bad status code.
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        # The body was not valid JSON.
        return None
def dup_detect(src_api, dst_api, src_file, dst_file):
    """Score the textual similarity of two pull requests.

    The title and body of each pull request are joined with a space,
    tokenised with ``jieba.lcut``, and the two token lists are compared with
    ``difflib.SequenceMatcher``.

    Args:
        src_api: Mapping with ``"title"`` and ``"body"`` entries for the
            first pull request.
        dst_api: Mapping with ``"title"`` and ``"body"`` entries for the
            second pull request.
        src_file: Unused; kept so existing callers keep working.
        dst_file: Unused; kept so existing callers keep working.

    Returns:
        The ``SequenceMatcher.ratio()`` of the two token lists.

    Raises:
        KeyError: If ``"title"`` or ``"body"`` is missing from either mapping.
    """

    def _joined_text(api):
        # A ``None`` title/body (presumably a PR without a description --
        # TODO confirm against the API) is treated as empty text instead of
        # failing on ``None + str``.
        return (api["title"] or "") + " " + (api["body"] or "")

    src_word = jieba.lcut(_joined_text(src_api))
    dst_word = jieba.lcut(_joined_text(dst_api))
    # Similarity score computed with difflib.
    return difflib.SequenceMatcher(None, src_word, dst_word).ratio()
def extract_filename(file_dic):
    """Return the ``"filename"`` value of every entry in ``file_dic``, in order.

    Args:
        file_dic: Iterable of mappings, each having a ``"filename"`` key.

    Returns:
        A list with one filename per entry.
    """
    return [entry["filename"] for entry in file_dic]
def main(src_url, dst_url):
    """Compute text and changed-file similarity scores for two pull requests.

    Args:
        src_url: API URL of the first pull request.
        dst_url: API URL of the second pull request.

    Returns:
        A ``(text_score, file_score)`` tuple, or ``(None, None)`` when any of
        the four API requests (two pull requests, two file lists) fails.
    """
    src_api = crawl_api(src_url)
    dst_api = crawl_api(dst_url)
    if src_api is None or dst_api is None:
        return None, None

    src_file = crawl_api(src_url + "/files")
    dst_file = crawl_api(dst_url + "/files")
    # The file-list requests can fail independently of the PR requests;
    # without this check a failure would crash in extract_filename(None).
    if src_file is None or dst_file is None:
        return None, None

    text_score = dup_detect(src_api, dst_api, src_file, dst_file)

    src_names = extract_filename(src_file)
    dst_names = extract_filename(dst_file)
    # Similarity score computed with difflib over the ordered filename lists.
    file_score = difflib.SequenceMatcher(None, src_names, dst_names).ratio()
    return text_score, file_score
def final_decision(txt_score, file_score):
    """Average the two scores and map the result to a likelihood label.

    Bands (lower bound inclusive, upper bound exclusive except the last):
        [0.0, 0.2) -> "极不可能" (extremely unlikely)
        [0.2, 0.4) -> "不可能"   (unlikely)
        [0.4, 0.6) -> "有可能"   (possible)
        [0.6, 0.8) -> "很有可能" (very likely)
        [0.8, 1.0] -> "极有可能" (extremely likely)

    Args:
        txt_score: Text similarity score.
        file_score: File-list similarity score.

    Returns:
        A ``(label, score)`` tuple where ``score`` is the mean of the inputs.

    Raises:
        ValueError: If the mean falls outside ``[0, 1]`` (including NaN).
        TypeError: If either score is not numeric (e.g. ``None``).
    """
    score = (txt_score + file_score) / 2
    # Reject out-of-range and NaN explicitly; otherwise no band would match
    # and the function would fail on an unbound local variable.
    if not 0 <= score <= 1:
        raise ValueError("score must be within [0, 1], got %r" % (score,))

    if score < 0.2:
        result = "极不可能"
    elif score < 0.4:
        result = "不可能"
    elif score < 0.6:
        result = "有可能"
    elif score < 0.8:
        result = "很有可能"
    else:
        result = "极有可能"
    return result, score