# [web-page notice, not part of the script] Code pull complete; the page will refresh automatically.
import os
import pandas as pd
import numpy as np
import re
# Source file name. The input path and the converted-output path are both
# derived from it below.
data_file_name = 'data7_3.tsv'
(data_file_fname, data_file_extname) = os.path.splitext(data_file_name)

# Input file location and output (converted CSV) location, both under the
# same data directory.
file_path, output_path = (
    os.path.join('data_src', leaf)
    for leaf in (data_file_name, f'{data_file_fname}_conv.csv')
)

# Number of columns in the data.
column_num = 2

# Whether each cell still carries a newline character that must be removed.
newline_symbol_included = True
# Captures everything that precedes the first newline character of a string.
reNewLine = re.compile(r'(.*?)\n')


def remove_newline_symbol(e: str) -> str:
    """Return the part of *e* that precedes its first newline.

    Args:
        e: The string to trim.

    Returns:
        The text before the first newline character in ``e``. When ``e``
        contains no newline it is returned unchanged.
    """
    mo = reNewLine.match(e)
    if mo:
        return mo.group(1)
    return e
# Read the whole source file; every line is treated as the text of one cell.
with open(file_path, encoding='utf-8') as _file:
    line_list = _file.readlines()
if newline_symbol_included:
    line_list = [remove_newline_symbol(e) for e in line_list]

# The first `column_num` lines are the header; everything after is content.
header = line_list[:column_num]
content = line_list[column_num:]

# The content must fill whole rows. Checked with integer arithmetic and an
# explicit exception so the validation still runs under `python -O`
# (which strips `assert` statements).
if len(content) % column_num != 0:
    raise ValueError(f'rows should be integer:{len(content) / column_num}')
rows = len(content) // column_num

# Lay the flat list of cells out as a (rows, column_num) table.
content = np.array(content).reshape(rows, column_num)
def compose_dataframe(header, content):
    """Build a DataFrame from column names and a 2-D table of cell values.

    Args:
        header: Iterable of column names; the i-th name labels the i-th
            column of ``content``.
        content: 2-D array (or nested sequence) of cell values laid out as
            (rows, columns).

    Returns:
        A ``pandas.DataFrame`` with one column per entry of ``header``.
        If a name occurs more than once in ``header``, the later column
        replaces the earlier one.
    """
    # Transpose so that indexing by i yields the i-th column of the table.
    columns = np.asarray(content).T
    return pd.DataFrame({name: columns[i] for i, name in enumerate(header)})
# Assemble the final table from the parsed header and cell contents.
rst_df = compose_dataframe(header, content)

# Quality check: report how many NA values each column contains.
_na_per_column = rst_df.isna().sum()
print(_na_per_column)

# Write the converted table as CSV, without the index column.
rst_df.to_csv(path_or_buf=output_path, index=False)
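# [web-page notice, not part of the script] This section may contain content unsuitable for display, so the page does not show it. You can review and revise it yourself using the relevant editing functions.
# [web-page notice, not part of the script] If you confirm that the content does not involve inappropriate language / pure advertising or traffic diversion / violence / vulgar or pornographic material / infringement / piracy / false information / worthless content, or content that violates national laws and regulations, you can click submit to appeal and we will handle it as soon as possible.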