代码拉取完成,页面将自动刷新
import os
import shutil
import zipfile
import pandas as pd
import urllib.parse as ups
import snoop
import re
# 问卷数据下载后是xls文件, 附件压缩包是zip文件, 两者和代码放在同一个文件夹中
# 转换工作文件夹,列出当前文件夹下面所有的文件
cwd = r"D:\SynologyDrive\护理学院工作\教学\spss软件实验\2020备课\practice2\药学"
os.chdir(cwd)
filelist = os.listdir()
# 下面分别是获取xls文件和zip文件的文件名
[xlsfile] = [e for e in filelist if os.path.splitext(e)[1] == '.xlsx']
[zipfilepath] = [e for e in filelist if os.path.splitext(e)[1] == '.zip']
# 先将zip包解压缩
# 首先使用zipfile库读取附件文件, 建立ZipFile对象
zip_ref = zipfile.ZipFile(zipfilepath, 'r')
# 为了避免命名出现错误,这里手动进行文件名的转码
targetEncode = 'gb2312'
nl = zip_ref.namelist()
# 解压缩以前先新建文件夹, 文件夹名称和压缩包的文件名相同
unzip_dir = os.path.splitext(zipfilepath)[0]
if os.path.exists(unzip_dir):
print("好像已经解压缩过了哦?")
else:
os.mkdir(unzip_dir)
# 对于一些特殊文件夹在这里先过滤掉
nl = [e for e in nl if '__' not in e]
# 手动遍历所有文件,然后对文件名进行转码
for fn in nl:
correct_fn = fn.encode('cp437').decode(targetEncode)
correct_fn = os.path.join(unzip_dir, correct_fn)
with open(correct_fn, 'wb') as output_file:
with zip_ref.open(fn, 'r') as original_file:
shutil.copyfileobj(original_file, output_file)
# zip文档内容,是文件名列表
zip_content = os.listdir(unzip_dir)
# 然后是读取Excel文件
# 声明filename extract函数
reIdNumberName_dep = re.compile(r"(\d+)_(\d+)_(.*)")
reIdNumber = re.compile(r"(\d+)_(.*)")
def fileNameExtractor(row,url_header:str,name_header:str):
# get the query string, like: path=http%3a%2f%2fpubuserqiniu.paperol.cn%2f37299324_1_q4_pDJ5pCCKECzTxAdqQLyuw.docx%3fattname%3d1_4_%25e5%25ae%259e%25e9%25aa%258c%25e6%258a%25a5%25e5%2591%258a.docx&activity=37299324
urlStr = row[url_header]
query1 = ups.urlparse(urlStr).query
# get the path string, like: http://pubuserqiniu.paperol.cn/37299324_1_q4_pDJ5pCCKECzTxAdqQLyuw.docx?attname=1_4_%e5%ae%9e%e9%aa%8c%e6%8a%a5%e5%91%8a.docx
activityStr = ups.parse_qs(query1)['activity'][0]
pathStr = ups.parse_qs(query1)['path'][0]
# parse the path string again to get query string
query2 = ups.urlparse(pathStr).query
# now the query string is like attname=1_4_%e5%ae%9e%e9%aa%8c%e6%8a%a5%e5%91%8a.docx
# parse_qs this string to get the encoded string of file
attname = ups.parse_qs(query2)['attname'][0]
snoop.pp(attname)
mo = reIdNumber.match(attname)
if mo is None:
print(f"{attname} 格式不对")
# conv_filename = f"序号{mo.group(1)}_{row[name_header]}_{mo.group(2)}_{mo.group(3)}"
file_name_prefix = re.compile(f"序号{mo.group(1)}_.*")
[conv_filename] = [e for e in zip_content if file_name_prefix.match(e)]
# 最后是从url中提取了问卷的activity Id和对应的上传文件文件名, 然后按照Excel中hyperlink公式的格式形成公式
# rst = '=HYPERLINK("./{}_附件/{}","作业链接")'.format(activityStr, docxFileName)
return f'=HYPERLINK("./{activityStr}_附件/{conv_filename}","作业链接")'
xl = pd.ExcelFile(xlsfile)
df_xl = xl.parse('Sheet1', dtype=str)
# 目前的问卷自动带有题号,题号后面会紧跟着文件名
reXingming = re.compile('姓名')
reShangchuan = re.compile('上传')
columns = list(df_xl.columns)
name_header = [e for e in columns if reXingming.search(e) is not None]
assert len(name_header)>0, "未发现姓名字样"
assert len(name_header)==1, "多重项包含姓名字样"
name_header = name_header[0]
upload_headers = [e for e in columns if reShangchuan.search(e) is not None]
link_headers = [e.replace('上传','链接:') for e in upload_headers]
assert len(upload_headers)>0, "未发现上传字样"
reUploadNumber = re.compile(r'(\d+)[、,\s]')
for i,u in enumerate(upload_headers):
df_xl[link_headers[i]] = df_xl.apply(lambda r:fileNameExtractor(r,u,name_header),axis=1)
# '序号','提交答卷时间','所用时间', 0:3
# 6:到结束, 然后drop掉原来的url列
df_rst = df_xl[columns[0:3]+columns[6:]+link_headers]
df_rst = df_rst.drop(columns=upload_headers)
# 设置输出文件的名称
output_xls = os.path.splitext(xlsfile)[0] + "_conv.xlsx"
# 建立excel writer对象
writer = pd.ExcelWriter(output_xls, engine='xlsxwriter')
# 输出
df_rst.to_excel(writer, 'Sheet1')
# 保存输出excel文件
writer.save()
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。