# ---- encode ----
import csv
import pandas as pd
def read_taskid(path, index):
    """Return one column of the CSV file at ``path``, selected by position.

    The file is parsed with ``pandas.read_csv`` and the column at zero-based
    position ``index`` is returned as the object produced by ``DataFrame.iloc``.
    """
    frame = pd.read_csv(path)
    return frame.iloc[:, index]
def write_lists_to_csv_columns(lists, header, filename):
    """Write ``header`` and then ``lists`` laid out as the columns of a CSV file.

    Every element of ``lists`` supplies one column. Rows are formed by zipping
    the columns together, so output stops at the end of the shortest one.
    """
    # Turn the column-oriented input into rows before touching the file.
    rows = [list(row_values) for row_values in zip(*lists)]
    with open(filename, 'w', newline='') as out_file:
        csv_out = csv.writer(out_file)
        csv_out.writerow(header)
        for row in rows:
            csv_out.writerow(row)
def encode_str_use_dir(string_list, string_to_number):
    """Translate each string in ``string_list`` through ``string_to_number``.

    Returns a new list holding the mapped value of every input string, in the
    same order. A string that is not a key of the mapping raises ``KeyError``.
    """
    number_list = []
    for text in string_list:
        number_list.append(string_to_number[text])
    return number_list
def make_dir_use_str(strings):
    """Build a mapping from each distinct string to a consecutive integer.

    Codes start at 0 and are handed out in order of first appearance, so the
    n-th distinct string seen receives code n - 1.
    """
    string_to_number = {}
    for text in strings:
        # The current size of the mapping is the next unused code; setdefault
        # leaves strings that already have a code untouched.
        string_to_number.setdefault(text, len(string_to_number))
    return string_to_number
# def save_dict_to_csv(dictionary, filename):
# with open(filename, 'w', newline='') as csvfile:
# fieldnames = list(dictionary.keys())
# writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
#
# writer.writeheader()
# writer.writerow(dictionary)
def save_dict_to_csv(dictionary, filename):
    """Write ``dictionary`` to ``filename`` as CSV, one ``key,value`` row per entry.

    No header row is written; rows follow the dictionary's iteration order.
    """
    with open(filename, 'w', newline='') as out_file:
        csv_out = csv.writer(out_file)
        for entry in dictionary.items():
            csv_out.writerow(list(entry))
def encode_list(lst):
    """Encode the items of ``lst`` as consecutive integers.

    Returns ``(encoded_list, encoding_dict)``: the dictionary maps every
    distinct item to a code assigned in order of first appearance (starting at
    0), and the list holds the code of each input item in the original order.
    """
    encoding_dict = {}
    # len(encoding_dict) is the next free code whenever an unseen item arrives.
    encoded_list = [
        encoding_dict.setdefault(item, len(encoding_dict)) for item in lst
    ]
    return encoded_list, encoding_dict
def read_field_csv(path, index_of_data):
    """Read one dotted-name column of a CSV file and split it into its parts.

    The first row is treated as a header and skipped. For every remaining row
    the cell at ``index_of_data`` is split on ``.``; the first three parts are
    returned as the database, table and field lists.

    Args:
        path: Path of the CSV file; it is read as UTF-8.
        index_of_data: Zero-based index of the column to read.

    Returns:
        A 5-tuple ``(parts, db, tb, fd, tb_db)`` where ``parts`` holds the full
        split list of every row, ``db`` / ``tb`` / ``fd`` hold the first,
        second and third part of every row, and ``tb_db`` holds the first two
        parts of every row as a list. All five are empty for a file with no
        data rows.

    Raises:
        IndexError: If a row has no cell at ``index_of_data`` or the cell has
            fewer than three dot-separated parts.
    """
    column_parts = []
    with open(path, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        # Skip the header row; the default keeps an empty file from raising.
        next(reader, None)
        for row in reader:
            try:
                cell = row[index_of_data]
            except IndexError:
                raise IndexError(
                    f"{path}: line {reader.line_num}: "
                    f"no column at index {index_of_data}"
                ) from None
            parts = cell.split('.')
            if len(parts) < 3:
                # Fail here with the offending value rather than later with a
                # bare "list index out of range".
                raise IndexError(
                    f"{path}: line {reader.line_num}: expected at least three "
                    f"dot-separated parts in column {index_of_data}, got {cell!r}"
                )
            column_parts.append(parts)

    db = [parts[0] for parts in column_parts]
    tb = [parts[1] for parts in column_parts]
    fd = [parts[2] for parts in column_parts]
    tb_db = [parts[:2] for parts in column_parts]
    return column_parts, db, tb, fd, tb_db
# Input CSV. Columns 3 and 4 are read as dotted names and split into their
# first three parts (named db / tb / fd here) for the "from" and "to" sides;
# column 0 is copied through under the 'task id' header.
f_path = '../data/字段血缘-带任务.csv'
column_data_from, db_from, tb_from, fd_from, tb_db_from = read_field_csv(f_path, 3)
column_data_to, db_to, tb_to, fd_to, tb_db_to = read_field_csv(f_path, 4)

# Every name that occurs on either side, so both sides share one code space.
db_for_make_dir = db_from + db_to
tb_for_make_dir = tb_from + tb_to
fd_for_make_dir = fd_from + fd_to

# One dictionary per level: name -> consecutive integer code, assigned in
# order of first appearance.
db_dir = make_dir_use_str(db_for_make_dir)
tb_dir = make_dir_use_str(tb_for_make_dir)
fd_dir = make_dir_use_str(fd_for_make_dir)
save_dict_to_csv(db_dir, './process_ret/db_dict.csv')
save_dict_to_csv(tb_dir, './process_ret/tb_dict.csv')
save_dict_to_csv(fd_dir, './process_ret/fd_dict.csv')

# Encode both sides with the very dictionaries that were written to disk, so
# the saved dictionaries are guaranteed to decode the output file.
db_from_ecd = encode_str_use_dir(db_from, db_dir)
tb_from_ecd = encode_str_use_dir(tb_from, tb_dir)
fd_from_ecd = encode_str_use_dir(fd_from, fd_dir)
db_to_ecd = encode_str_use_dir(db_to, db_dir)
tb_to_ecd = encode_str_use_dir(tb_to, tb_dir)
fd_to_ecd = encode_str_use_dir(fd_to, fd_dir)

# Write the task id column plus the six encoded columns side by side.
rewrite_path = './process_ret/re_code_all.csv'
taskid = read_taskid(f_path, 0)
rewrite_all_data = [
    taskid,
    db_from_ecd,
    tb_from_ecd,
    fd_from_ecd,
    db_to_ecd,
    tb_to_ecd,
    fd_to_ecd,
]
header = ['task id', "db_from_ecd", "tb_from_ecd", "fd_from_ecd", "db_to_ecd", "tb_to_ecd", "fd_to_ecd"]
write_lists_to_csv_columns(rewrite_all_data, header, rewrite_path)

# Summary: dictionary sizes, then row counts and distinct codes per side.
print(f"db dict: {len(db_dir)}, tb dict: {len(tb_dir)}, fd dict: {len(fd_dir)}")
print(f"FROM : db : {len(db_from_ecd)}, tb : {len(tb_from_ecd)}, fd : {len(fd_from_ecd)}")
print(f"FROM : db uni : {len(set(db_from_ecd))}, tb uni: {len(set(tb_from_ecd))}, fd uni: {len(set(fd_from_ecd))}")
print(f"TO : db : {len(db_to_ecd)}, tb : {len(tb_to_ecd)}, fd : {len(fd_to_ecd)}")
print(f"TO : db uni : {len(set(db_to_ecd))}, tb uni: {len(set(tb_to_ecd))}, fd uni: {len(set(fd_to_ecd))}")
# ---- process ----
import pandas as pd
import csv
# Table-level output
def save_tb_dict_to_csv_with_degree(filename, dictionary):
    """Write ``dictionary`` to ``filename`` as CSV under a fixed four-column header.

    After the header, each entry becomes one row: the key followed by the
    elements of its value (an iterable).
    """
    with open(filename, 'w', newline='') as out_file:
        csv_out = csv.writer(out_file)
        csv_out.writerow(['tb_code','tb_num', 'out', 'in'])
        for code, stats in dictionary.items():
            csv_out.writerow([code, *stats])
# Database-level output
def save_dict_to_csv_with_degree(filename, dictionary):
    """Write ``dictionary`` to ``filename`` as CSV under a fixed four-column header.

    After the header, each entry becomes one row: the key followed by the
    elements of its value (an iterable). Every row is also printed to stdout
    before it is written.
    """
    with open(filename, 'w', newline='') as out_file:
        csv_out = csv.writer(out_file)
        csv_out.writerow(['db_code','tb_num', 'out', 'in'])
        for code, stats in dictionary.items():
            row = [code, *stats]
            print(row)
            csv_out.writerow(row)
def save_dict_to_csv(filename, dictionary):
    """Write ``dictionary`` to ``filename`` as CSV, one ``key,value`` row per entry.

    No header row is written; rows follow the dictionary's iteration order.
    """
    with open(filename, 'w', newline='') as out_file:
        csv_out = csv.writer(out_file)
        for entry in dictionary.items():
            csv_out.writerow(list(entry))
def save_dict(save_name, dict_save):
    """Write ``dict_save`` to the CSV file ``save_name`` as two rows.

    The first row holds the keys and the second the values, both in the
    dictionary's iteration order. A confirmation line with the file path is
    printed afterwards.
    """
    with open(save_name, 'w', newline='') as out_file:
        csv_out = csv.writer(out_file)
        # Row 1: keys (acts as the header); row 2: the matching values.
        for row in (dict_save.keys(), dict_save.values()):
            csv_out.writerow(row)
    print("CSV文件已保存为:", save_name)
def read_data(path):
    """Load the CSV file at ``path`` and return its columns as plain lists.

    The file is parsed with ``pandas.read_csv``; the result holds one list per
    column, in the file's column order.
    """
    frame = pd.read_csv(path)
    return [frame[name].tolist() for name in frame.columns]
def sta_tb_num_in_db(path):
    """Count, per database code, its distinct tables and its out/in degree.

    Columns 1, 2, 4 and 5 of the CSV at ``path`` are used as the source
    database, source table, target database and target table codes. For each
    database code the function derives the number of distinct (database,
    table) pairs it appears in, the number of rows where it is the source
    ("out") and the number of rows where it is the target ("in"), prints some
    progress figures, and writes the result to
    ``./process_ret/tb_nums_in_db_degree.csv``.
    """
    columns = read_data(path)
    db_from, tb_from, db_to, tb_to = columns[1], columns[2], columns[4], columns[5]
    print(f"uni db 0 :{db_from.count(0)}")

    # Row counts per database on the source side (out) and target side (in).
    db_out = {}
    db_in = {}
    for src_db, dst_db in zip(db_from, db_to):
        db_out[src_db] = db_out.get(src_db, 0) + 1
        db_in[dst_db] = db_in.get(dst_db, 0) + 1
    print(f"len of out : {len(db_out)}")
    print(f"len of in : {len(db_in)}")

    # Distinct (database, table) pairs from both sides, each mapped to its
    # database code.
    uni_tb_db = {}
    for pair in zip(db_from + db_to, tb_from + tb_to):
        uni_tb_db.setdefault(pair, pair[0])
    print(len(uni_tb_db))

    # Number of distinct tables per database.
    tb_in_db = {}
    for db_code in uni_tb_db.values():
        tb_in_db[db_code] = tb_in_db.get(db_code, 0) + 1
    print(len(tb_in_db))
    print(tb_in_db)

    # A database may only ever appear on one side; give it a zero degree on
    # the other so the merge below finds every key.
    for db_code in tb_in_db:
        db_out.setdefault(db_code, 0)
        db_in.setdefault(db_code, 0)
    print(f"len of out : {len(db_out)}")
    print(f"len of in : {len(db_in)}")

    merge_dict = {
        db_code: [tb_count, db_out[db_code], db_in[db_code]]
        for db_code, tb_count in tb_in_db.items()
    }
    save_dict_to_csv_with_degree('./process_ret/tb_nums_in_db_degree.csv', merge_dict)
def sta_fd_num_in_tb(path):
    """Count, per table, its distinct fields and its out/in degree.

    Columns 1-3 and 4-6 of the CSV at ``path`` are used as the source and
    target (database, table, field) codes. A table is identified by the string
    ``"<db>__<table>"``. For each table the function derives the number of
    distinct (database, table, field) triples it appears in, the number of
    rows where it is the source ("out") and where it is the target ("in"),
    prints some progress figures, and writes the result to
    ``./process_ret/fb_nums_in_tb_degree.csv``.
    """
    columns = read_data(path)
    db_from, tb_from, fd_from = columns[1], columns[2], columns[3]
    db_to, tb_to, fd_to = columns[4], columns[5], columns[6]

    # Row counts per table on the source side.
    tb_out = {}
    for db_code, tb_code in zip(db_from, tb_from):
        table_key = "__".join((str(db_code), str(tb_code)))
        tb_out[table_key] = tb_out.get(table_key, 0) + 1

    # Row counts per table on the target side.
    tb_in = {}
    for db_code, tb_code in zip(db_to, tb_to):
        table_key = "__".join((str(db_code), str(tb_code)))
        tb_in[table_key] = tb_in.get(table_key, 0) + 1

    print(f"len of out : {len(tb_out)}")
    print(f"len of in : {len(tb_in)}")

    # Distinct (database, table, field) triples from both sides, each mapped
    # to the key of the table it belongs to.
    uni_tb_db = {}
    for triple in zip(db_from + db_to, tb_from + tb_to, fd_from + fd_to):
        uni_tb_db.setdefault(triple, "__".join((str(triple[0]), str(triple[1]))))
    print(len(uni_tb_db))

    # Number of distinct fields per table.
    fb_in_tb = {}
    for table_key in uni_tb_db.values():
        fb_in_tb[table_key] = fb_in_tb.get(table_key, 0) + 1

    # A table may only ever appear on one side; give it a zero degree on the
    # other so the merge below finds every key.
    for table_key in fb_in_tb:
        tb_out.setdefault(table_key, 0)
        tb_in.setdefault(table_key, 0)
    print(len(fb_in_tb))
    print(f"len of out : {len(tb_out)}")
    print(f"len of in : {len(tb_in)}")

    merge_dict = {
        table_key: [fd_count, tb_out[table_key], tb_in[table_key]]
        for table_key, fd_count in fb_in_tb.items()
    }
    save_tb_dict_to_csv_with_degree('./process_ret/fb_nums_in_tb_degree.csv', merge_dict)
# CSV passed to the statistics functions below.
path = './process_ret/re_code_all.csv'
# Run the per-database statistics; the per-table variant is currently disabled.
sta_tb_num_in_db(path)
# sta_fd_num_in_tb(path)
# ---- search graph ----
import pandas as pd
import csv
def read_data(path):
    """Read the CSV file at ``path`` and return one plain list per column.

    Parsing is done by ``pandas.read_csv``; the lists appear in the file's
    column order.
    """
    table = pd.read_csv(path)
    return [table[label].tolist() for label in table.columns]
def get_fd(path):
    """Return the source and target field triples stored in the CSV at ``path``.

    Columns 1-3 are zipped into the source triples and columns 4-6 into the
    target triples, one triple per row.

    Returns:
        ``(fd_from, fd_to, fd_from_uni, fd_to_uni)``: the two lists of
        3-tuples in row order, followed by the set of distinct triples of
        each list.
    """
    data = read_data(path)
    fd_from = list(zip(data[1], data[2], data[3]))
    fd_to = list(zip(data[4], data[5], data[6]))
    return fd_from, fd_to, set(fd_from), set(fd_to)
def cons_graph(path):
    """Build an adjacency-list graph from the field triples in the CSV at ``path``.

    Each triple is identified by its three components joined with ``_``.
    Every triple from either side becomes a key of the returned dict; for
    every row the target triple's id is appended to the source triple's list,
    so repeated rows produce repeated neighbours.
    """
    def node_id(triple):
        # Join the three components of a triple into its node id.
        first, second, third = triple
        return "_".join((str(first), str(second), str(third)))

    fd_from, fd_to, fd_from_uni, fd_to_uni = get_fd(path)
    fd_all = fd_from + fd_to
    print(f"len of all fd : {len(fd_all)}")

    # Register every node first so nodes without outgoing edges are present.
    graph = {}
    for triple in fd_all:
        graph.setdefault(node_id(triple), [])
    print(f"len of uni fd : {len(graph)}")

    # One edge per row, source -> target.
    for source, target in zip(fd_from, fd_to):
        graph[node_id(source)].append(node_id(target))
    return graph
# def dfs(graph, start, end, path=[], paths=[]):
# path = path + [start]
# if start == end:
# paths.append(path)
# if start not in graph:
# return paths
# for node in graph[start]:
# if node not in path:
# paths = dfs(graph, node, end, path, paths)
# return paths
def dfs(graph, start, path=None, paths=None):
    """Collect every cycle-free path that begins at ``start``.

    The graph is walked depth-first. For each node reached that is a key of
    ``graph``, the path leading to it is recorded once all of its neighbours
    have been explored, so longer paths appear before their prefixes. A node
    already on the current path is not revisited, and a node that is not a
    key of ``graph`` contributes nothing.

    Args:
        graph: Mapping of node -> iterable of neighbour nodes.
        start: Node to start (or continue) the walk from.
        path: Nodes visited before ``start`` on the current walk; used by the
            recursion. Defaults to an empty path.
        paths: List that found paths are appended to; used by the recursion.
            When omitted, a fresh list is created for this call.

    Returns:
        The list of paths, each a list of nodes starting at the root node.
    """
    # Fresh containers per call: mutable default arguments would be shared
    # between calls and leak earlier results into later ones.
    if paths is None:
        paths = []
    path = ([] if path is None else path) + [start]
    if start not in graph:
        return paths
    for node in graph[start]:
        if node not in path:
            dfs(graph, node, path, paths)
    paths.append(path)
    return paths
def save_graph(graph, filename):
    """Write ``graph`` to ``filename`` as text, one ``node: n1, n2, ...`` line per node.

    Neighbours are joined with ``", "``; a node without neighbours is written
    with nothing after the colon and space. A confirmation line is printed
    afterwards.
    """
    with open(filename, 'w') as out_file:
        out_file.writelines(
            f"{node}: {', '.join(neighbors)}\n"
            for node, neighbors in graph.items()
        )
    print(f"Graph data saved to {filename}")
def load_graph(filename):
    """Load a graph from a text file of ``node: n1, n2, ...`` lines.

    This reads the format written by ``save_graph``: the text before the first
    colon is the node, the rest is a comma-separated neighbour list.
    Whitespace around the node and around each neighbour is removed, empty
    neighbour entries are dropped (so a node without neighbours maps to an
    empty list), and blank lines are skipped.

    Returns:
        Dict mapping each node to its list of neighbour names.

    Raises:
        ValueError: If a non-blank line contains no colon.
    """
    graph = {}
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # Split on the first colon only, so later colons stay in the
            # neighbour text instead of breaking the unpacking.
            node, separator, neighbor_text = line.partition(':')
            if not separator:
                raise ValueError(f"malformed graph line (missing ':'): {line!r}")
            graph[node.strip()] = [
                name.strip() for name in neighbor_text.split(',') if name.strip()
            ]
    return graph
# Input CSV for the graph and the text file the adjacency list is written to.
path = './process_ret/re_code_all.csv'
g_save = './process_ret/graph.txt'
# Build the graph from the CSV and persist it.
G = cons_graph(path)
save_graph(G,g_save)
# Node ids in the "<db>_<table>_<field>" form produced by cons_graph; they are
# only referenced by the commented-out search below.
start = '8_51_67'
end = '10_3616_10974'
# PATH = dfs(G,start)
# print(PATH)
# print(g)
# for data in g.items():
# key,v = data
# if(len(v) != 0):
# print(key)