master

分支 (1)

管理

管理

master

short_script
/
20210311_dataset_convertion_spss.py

import os
import pandas as pd
import numpy as np
import re

# Name of the input file; the input and output paths below are derived from it.
data_file_name = 'data7_3.tsv'
# Number of columns in the data set.
column_num = 2
# Whether each cell read from the file still carries a newline character.
newline_symbol_included = True

# Captures everything that precedes the first newline of a string.
reNewLine = re.compile('(.*?)\n')

# Split the file name into stem and extension so the stem can be reused.
data_file_fname, data_file_extname = os.path.splitext(data_file_name)
# Input file location.
file_path = os.path.join('data_src', data_file_name)
# Output file location: same directory, stem suffixed with "_conv", CSV format.
output_path = os.path.join('data_src', f'{data_file_fname}_conv.csv')


def remove_newline_symbol(e:str) -> str:
    """Return the part of ``e`` that precedes its first newline character.

    A string that contains no newline is returned unchanged, so the
    function is safe to call on a final line that lacks a trailing
    newline.

    Args:
        e: A single line of text, usually ending in a newline.

    Returns:
        The text before the first newline, or ``e`` itself when there is none.
    """
    # str.partition yields the whole string as the head when the separator
    # is absent, which covers both the "newline found" and "no newline"
    # cases without a regular expression or a module-level pattern.
    return e.partition('\n')[0]


# Read every line of the source file; each line holds exactly one cell.
with open(file_path,encoding='utf') as _file:
    line_list = _file.readlines()

# Strip the newline from each cell once the file has been closed.
if newline_symbol_included:
    line_list = [remove_newline_symbol(e) for e in line_list]

# The first `column_num` lines are the column names; every remaining line is
# a data cell, listed row by row.
header = line_list[:column_num]
content = line_list[column_num:]

# The cells must fill complete rows. Check this with integer arithmetic and
# an explicit exception: an `assert` would be stripped under `python -O`.
rows, leftover = divmod(len(content), column_num)
if leftover:
    raise ValueError(
        f'{len(content)} data cells cannot be split into rows of '
        f'{column_num} columns ({leftover} cell(s) left over)'
    )
content = np.array(content).reshape(rows,column_num)


def compose_dataframe(header,content):
    """Build a DataFrame whose columns are named after ``header``.

    The i-th name in ``header`` is paired with the i-th column of
    ``content``, obtained by transposing it. Names are used as dictionary
    keys, so a repeated name keeps only the last column assigned to it.

    Args:
        header: Sequence of column names.
        content: Two-dimensional array with one row per record.

    Returns:
        A ``pandas.DataFrame`` with one column per name in ``header``.
    """
    # Transpose a view so that indexing by position yields whole columns.
    columns = content.view().transpose()
    return pd.DataFrame(
        {name: columns[idx] for idx, name in enumerate(header)}
    )


# Assemble the converted table from the parsed header and the cell matrix.
rst_df = compose_dataframe(header,content)

# Quality check: report how many NA values each column contains.
na_per_column = rst_df.isna().sum()
print(na_per_column)

# Write the result as CSV without the DataFrame index column.
rst_df.to_csv(output_path,index=False)