# forked from zhangyu19/dupPRdetect
# Standard library
import difflib
import json
import os
from collections import Counter

# Third-party
import jieba
import numpy as np
import requests
from fuzzywuzzy import fuzz

# GitHub API token used by crawl_api() for authenticated requests.
# SECURITY NOTE(review): a credential was hard-coded here. Prefer supplying
# it via the GITHUB_TOKEN environment variable; the original value is kept
# only as a backward-compatible fallback and should be revoked/rotated.
token = os.environ.get("GITHUB_TOKEN", "c8648d8381aa3e64b6778d6ee48602e9")
def crawl_api(url):
    """Fetch *url* from the GitHub API and return the decoded JSON payload.

    Uses the module-level ``token`` for authentication. Returns ``None``
    when the request fails (network error or non-200 status), matching how
    callers in this file already treat failures.
    """
    header = {"Authorization": "token %s" % token}
    # A timeout prevents the script from hanging forever on a stalled
    # connection (requests has no default timeout). stream=True was dropped:
    # the body is always consumed immediately via .json(), and with streaming
    # enabled the early `return None` below would leak the connection.
    try:
        response = requests.get(url, headers=header, timeout=30)
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None
    return response.json()
def dup_detect(src_api, dst_api, src_file, dst_file):
    """Score the textual similarity of two pull requests.

    Concatenates each PR's title and body, tokenizes with jieba, and returns
    difflib.SequenceMatcher's ratio (0.0-1.0) over the two token lists.

    ``src_file``/``dst_file`` are accepted for interface compatibility but
    are unused here; file-level similarity is computed separately in main().
    """
    # BUG FIX: GitHub returns body=null for PRs with an empty description;
    # guard so the string concatenation does not raise TypeError.
    src_txt = src_api["title"] + " " + (src_api["body"] or "")
    dst_txt = dst_api["title"] + " " + (dst_api["body"] or "")

    src_word = jieba.lcut(src_txt)
    dst_word = jieba.lcut(dst_txt)

    # difflib similarity score over token sequences
    return difflib.SequenceMatcher(None, src_word, dst_word).ratio()
def extract_filename(file_dic):
    """Return the list of "filename" values from a GitHub /files payload."""
    return [entry["filename"] for entry in file_dic]
def main(src_url, dst_url):
    """Fetch two pull requests and return (text_score, file_score).

    Both scores are difflib ratios in [0.0, 1.0]. Returns (None, None) when
    any of the four API calls fails.
    """
    src_api = crawl_api(src_url)
    dst_api = crawl_api(dst_url)

    if src_api is None or dst_api is None:
        return None, None

    src_file = crawl_api(src_url + "/files")
    dst_file = crawl_api(dst_url + "/files")
    # BUG FIX: the original validated only the PR payloads; a failed /files
    # request returned None and crashed extract_filename with a TypeError.
    if src_file is None or dst_file is None:
        return None, None

    text_score = dup_detect(src_api, dst_api, src_file, dst_file)

    src_names = extract_filename(src_file)
    dst_names = extract_filename(dst_file)

    # difflib similarity score over the lists of changed filenames
    file_score = difflib.SequenceMatcher(None, src_names, dst_names).ratio()

    return text_score, file_score
def final_decision(txt_score, file_score):
    """Average the two similarity scores and map the result to a verdict.

    Returns (label, score) where ``score = (txt_score + file_score) / 2``
    and ``label`` is one of five likelihood strings.

    Raises ValueError when the averaged score falls outside [0, 1];
    the original code hit an UnboundLocalError in that case.
    """
    score = (txt_score + file_score) / 2

    if not 0 <= score <= 1:
        raise ValueError("score out of range [0, 1]: %r" % score)

    # Buckets are half-open on the right: [0,0.2), [0.2,0.4), [0.4,0.6),
    # [0.6,0.8), [0.8,1]. An elif chain replaces the original independent
    # ifs (only one branch can match anyway).
    if score < 0.2:
        result = "极不可能"
    elif score < 0.4:
        result = "不可能"
    elif score < 0.6:
        result = "有可能"
    elif score < 0.8:
        result = "很有可能"
    else:
        result = "极有可能"

    return result, score