###Third code of Transn Internship
def detect_contains_term(input_src, input_target):
    """Find the closest term-base entry whose source text contains ``input_src``.

    Candidate rows are those of the module-level ``dot_term_info`` table whose
    ``src_text`` contains ``input_src`` as a literal substring. Among the
    candidates, the translation with the highest ``cal_distance`` score against
    ``input_target`` is selected (the code takes the maximum, so a higher score
    is treated as a closer match).

    Args:
        input_src: Source-language term to look up.
        input_target: Translation supplied for ``input_src``.

    Returns:
        A dict with four keys:

        * ``term_input_result`` -- ``'存在相似术语'`` ("a similar term exists")
          when at least one candidate row is found, otherwise
          ``'不存在相似的术语'`` ("no similar term exists").
        * ``term_id`` -- list of ``term_id`` values of the candidate rows that
          carry the selected translation, or ``''`` when nothing matched.
        * ``term_tran_text`` -- the selected translation, or ``''``.
        * ``term_ratio`` -- mean of the best translation score and the average
          ``cal_distance(input_src, src_text)`` over the selected rows, or
          ``''`` when nothing matched.
    """
    # regex=False: the input is a plain term, not a pattern. With the default
    # regex=True, terms containing characters such as "(", "+" or "*" would be
    # interpreted as a regular expression and either raise or match wrong rows.
    contains_mask = dot_term_info['src_text'].str.contains(input_src, regex=False)
    candidates = dot_term_info[contains_mask]

    if len(candidates) == 0:
        return {
            'term_input_result': '不存在相似的术语',
            'term_id': '',
            'term_tran_text': '',
            'term_ratio': '',
        }

    candidates = candidates.drop_duplicates()
    target_texts = candidates['target_text'].to_numpy()

    # Score every candidate translation against the supplied translation and
    # keep the first one that reaches the maximum score.
    target_scores = [cal_distance(text, input_target) for text in target_texts]
    best_score = max(target_scores)
    best_target = target_texts[target_scores.index(best_score)]

    # Several candidate rows may share the selected translation.
    best_rows = candidates[candidates['target_text'] == best_target]
    best_src_texts = best_rows['src_text'].tolist()

    # Average score between the input source and each matching source text.
    src_score = sum(cal_distance(input_src, src) for src in best_src_texts)
    src_score = src_score / len(best_src_texts)

    return {
        'term_input_result': '存在相似术语',
        'term_id': best_rows['term_id'].tolist(),
        'term_tran_text': best_target,
        'term_ratio': (best_score + src_score) / 2,
    }
def detect_contains_term(input_src, input_target):
    """Find the closest term-base entry whose source text contains ``input_src``.

    Candidate rows are those of the module-level ``dot_term_info`` table whose
    ``src_text`` contains ``input_src`` as a literal substring. Among the
    candidates, the translation with the highest ``cal_distance`` score against
    ``input_target`` is selected (the code takes the maximum, so a higher score
    is treated as a closer match).

    Args:
        input_src: Source-language term to look up.
        input_target: Translation supplied for ``input_src``.

    Returns:
        A dict with four keys:

        * ``term_input_result`` -- ``'存在相似术语'`` ("a similar term exists")
          when at least one candidate row is found, otherwise
          ``'不存在相似的术语'`` ("no similar term exists").
        * ``term_id`` -- list of ``term_id`` values of the candidate rows that
          carry the selected translation, or ``''`` when nothing matched.
        * ``term_tran_text`` -- the selected translation, or ``''``.
        * ``term_ratio`` -- mean of the best translation score and the average
          ``cal_distance(input_src, src_text)`` over the selected rows, or
          ``''`` when nothing matched.
    """
    # regex=False: the input is a plain term, not a pattern. With the default
    # regex=True, terms containing characters such as "(", "+" or "*" would be
    # interpreted as a regular expression and either raise or match wrong rows.
    contains_mask = dot_term_info['src_text'].str.contains(input_src, regex=False)
    candidates = dot_term_info[contains_mask]

    if len(candidates) == 0:
        return {
            'term_input_result': '不存在相似的术语',
            'term_id': '',
            'term_tran_text': '',
            'term_ratio': '',
        }

    candidates = candidates.drop_duplicates()
    target_texts = candidates['target_text'].to_numpy()

    # Score every candidate translation against the supplied translation and
    # keep the first one that reaches the maximum score.
    target_scores = [cal_distance(text, input_target) for text in target_texts]
    best_score = max(target_scores)
    best_target = target_texts[target_scores.index(best_score)]

    # Several candidate rows may share the selected translation.
    best_rows = candidates[candidates['target_text'] == best_target]
    best_src_texts = best_rows['src_text'].tolist()

    # Average score between the input source and each matching source text.
    src_score = sum(cal_distance(input_src, src) for src in best_src_texts)
    src_score = src_score / len(best_src_texts)

    return {
        'term_input_result': '存在相似术语',
        'term_id': best_rows['term_id'].tolist(),
        'term_tran_text': best_target,
        'term_ratio': (best_score + src_score) / 2,
    }
###Third code of Transn Internship
def detect_match_term(input_target, data_match_src):
    """Score a supplied translation against the entries of an exactly matched source term.

    Args:
        input_target: Translation supplied by the caller.
        data_match_src: Non-empty DataFrame of term-base rows whose
            ``src_text`` equals the looked-up source term. It must provide the
            ``target_text`` and ``term_id`` columns. An empty frame makes
            ``max()`` raise ``ValueError``.

    Returns:
        A dict with four keys:

        * ``term_input_result`` -- ``'存在一样术语'`` ("identical term exists")
          when the best ``cal_distance`` score equals 1, otherwise
          ``'存在译文不一样术语'`` ("term exists with a different translation").
        * ``term_id`` -- list of ``term_id`` values of the rows in
          ``data_match_src`` that carry the selected translation.
        * ``term_tran_text`` -- the stored translation with the highest score
          against ``input_target``.
        * ``term_ratio`` -- mean of the best score and the share of rows in
          ``data_match_src`` that use the selected translation.
    """
    # Distinct translations stored for this source term.
    unique_targets = pd.unique(data_match_src['target_text'])

    # Score each stored translation against the supplied one; the first
    # translation reaching the maximum score is selected.
    target_scores = [cal_distance(text, input_target) for text in unique_targets]
    best_score = max(target_scores)
    best_target = unique_targets[target_scores.index(best_score)]

    # Restrict to the rows of the matched source term. Searching the whole term
    # base by translation text alone would also return ids of unrelated source
    # terms that merely share the same translation.
    best_rows = data_match_src.loc[data_match_src['target_text'] == best_target]

    # Share of this source term's rows that use the selected translation.
    conditional_pro = len(best_rows) / len(data_match_src)

    if best_score == 1:
        input_result = '存在一样术语'
    else:
        input_result = '存在译文不一样术语'

    return {
        'term_input_result': input_result,
        'term_id': best_rows['term_id'].tolist(),
        'term_tran_text': best_target,
        'term_ratio': (best_score + conditional_pro) / 2,
    }
def detect_match_term(input_target, data_match_src):
    """Score a supplied translation against the entries of an exactly matched source term.

    Args:
        input_target: Translation supplied by the caller.
        data_match_src: Non-empty DataFrame of term-base rows whose
            ``src_text`` equals the looked-up source term. It must provide the
            ``target_text`` and ``term_id`` columns. An empty frame makes
            ``max()`` raise ``ValueError``.

    Returns:
        A dict with four keys:

        * ``term_input_result`` -- ``'存在一样术语'`` ("identical term exists")
          when the best ``cal_distance`` score equals 1, otherwise
          ``'存在译文不一样术语'`` ("term exists with a different translation").
        * ``term_id`` -- list of ``term_id`` values of the rows in
          ``data_match_src`` that carry the selected translation.
        * ``term_tran_text`` -- the stored translation with the highest score
          against ``input_target``.
        * ``term_ratio`` -- mean of the best score and the share of rows in
          ``data_match_src`` that use the selected translation.
    """
    # Distinct translations stored for this source term.
    unique_targets = pd.unique(data_match_src['target_text'])

    # Score each stored translation against the supplied one; the first
    # translation reaching the maximum score is selected.
    target_scores = [cal_distance(text, input_target) for text in unique_targets]
    best_score = max(target_scores)
    best_target = unique_targets[target_scores.index(best_score)]

    # Restrict to the rows of the matched source term. Searching the whole term
    # base by translation text alone would also return ids of unrelated source
    # terms that merely share the same translation.
    best_rows = data_match_src.loc[data_match_src['target_text'] == best_target]

    # Share of this source term's rows that use the selected translation.
    conditional_pro = len(best_rows) / len(data_match_src)

    if best_score == 1:
        input_result = '存在一样术语'
    else:
        input_result = '存在译文不一样术语'

    return {
        'term_input_result': input_result,
        'term_id': best_rows['term_id'].tolist(),
        'term_tran_text': best_target,
        'term_ratio': (best_score + conditional_pro) / 2,
    }
###Third code of Transn Internship
import pandas as pd
import jieba
from utils.distance_util import cal_distance
# Term base, loaded once at import time from a local CSV file.
# `names=` assigns the three column labels used by the functions in this module.
# `na_filter=False` turns off pandas' missing-value detection, so empty cells
# are kept as empty strings rather than NaN.
# NOTE(review): the path is absolute and machine-specific (Windows desktop);
# importing this module fails wherever that file is absent -- confirm deployment.
dot_term_info = pd.read_csv(r'C:\Users\Administrator\Desktop\艾佳\Internship\Detect_term\dot_term_info.csv',
names=['term_id','src_text', 'target_text'], na_filter=False)
# Debug aid (disabled): dot_term_info.info()
def detect_term(input_src, input_target):
    """Check a (source term, translation) pair against the term base.

    Rows of the module-level ``dot_term_info`` table whose ``src_text`` equals
    ``input_src`` are looked up first. If any exist, the pair is evaluated by
    ``detect_match_term``; otherwise the lookup falls back to the substring
    search in ``detect_contains_term``.

    Args:
        input_src: Source-language term to look up.
        input_target: Translation supplied for ``input_src``.

    Returns:
        The result dict produced by whichever helper handled the lookup.
    """
    # Exact match on the source text.
    exact_rows = dot_term_info.loc[dot_term_info['src_text'] == input_src]

    # No exact hit: fall back to terms whose source text contains the input.
    if len(exact_rows) < 1:
        return detect_contains_term(input_src, input_target)

    return detect_match_term(input_target, exact_rows)
import pandas as pd
import jieba
from utils.distance_util import cal_distance
# Term base, loaded once at import time from a local CSV file.
# `names=` assigns the three column labels used by the functions in this module.
# `na_filter=False` turns off pandas' missing-value detection, so empty cells
# are kept as empty strings rather than NaN.
# NOTE(review): the path is absolute and machine-specific (Windows desktop);
# importing this module fails wherever that file is absent -- confirm deployment.
dot_term_info = pd.read_csv(r'C:\Users\Administrator\Desktop\艾佳\Internship\Detect_term\dot_term_info.csv',
names=['term_id','src_text', 'target_text'], na_filter=False)
# Debug aid (disabled): dot_term_info.info()
def detect_term(input_src, input_target):
    """Check a (source term, translation) pair against the term base.

    Rows of the module-level ``dot_term_info`` table whose ``src_text`` equals
    ``input_src`` are looked up first. If any exist, the pair is evaluated by
    ``detect_match_term``; otherwise the lookup falls back to the substring
    search in ``detect_contains_term``.

    Args:
        input_src: Source-language term to look up.
        input_target: Translation supplied for ``input_src``.

    Returns:
        The result dict produced by whichever helper handled the lookup.
    """
    # Exact match on the source text.
    exact_rows = dot_term_info.loc[dot_term_info['src_text'] == input_src]

    # No exact hit: fall back to terms whose source text contains the input.
    if len(exact_rows) < 1:
        return detect_contains_term(input_src, input_target)

    return detect_match_term(input_target, exact_rows)
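# (Web-page residue: "Hot recommendations" widget label from the page this code was copied from; not part of the program.)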