利用DeepSeek编写验证xlsx格式文件中是否启用sharedStrings.xml对读写效率影响python程序
让他分别用100*10000个单元格有100、1000、10000个不同的1-200字符长的大写英文字母字符串测试.
一开始DeepSeek没有找到启用sharedStrings.xml的写xlsx模块,我自己找了pyxlsbwriter的例子告诉他才改好的。
import osimport timeimport randomimport stringimport pandas as pdfrom openpyxl import Workbookfrom openpyxl.utils.dataframe import dataframe_to_rowsimport matplotlib.pyplot as pltimport numpy as npfrom pyxlsbwriter import XlsxWriterdef generate_random_string(length): \"\"\"生成指定长度的大写英文字母随机字符串\"\"\" return \'\'.join(random.choices(string.ascii_uppercase, k=length))def create_test_data(rows, cols, unique_strings): \"\"\" 创建测试数据 rows: 行数 cols: 列数 unique_strings: 不同字符串的数量 \"\"\" # 生成唯一字符串池 string_pool = [generate_random_string(random.randint(1, 200)) for _ in range(unique_strings)] # 创建数据矩阵 data = [] for i in range(rows): row = [] for j in range(cols): # 从字符串池中随机选择一个字符串 row.append(random.choice(string_pool)) data.append(row) return datadef write_excel_with_shared_strings(data, filename): \"\"\"使用shared strings写入Excel文件(使用pyxlsbwriter库确保生成sharedStrings.xml)\"\"\" # 使用XlsxWriter创建xlsx文件,默认会使用shared strings with XlsxWriter(filename, compressionLevel=6) as writer: writer.add_sheet(\"Sheet1\") writer.write_sheet(data) def write_excel_without_shared_strings(data, filename): \"\"\"不使用shared strings写入Excel文件(转换为pandas DataFrame再保存)\"\"\" df = pd.DataFrame(data) df.to_excel(filename, index=False, header=False)def read_excel(filename): \"\"\"读取Excel文件并测量时间\"\"\" start_time = time.time() df = pd.read_excel(filename, header=None) end_time = time.time() return end_time - start_time, dfdef test_scenario(rows, cols, unique_strings, output_dir=\"test_results\"): \"\"\"测试一个场景:特定行、列和唯一字符串数量的情况\"\"\" if not os.path.exists(output_dir): os.makedirs(output_dir) # 创建测试数据 print(f\"生成测试数据: {rows}行 x {cols}列, {unique_strings}个唯一字符串\") data = create_test_data(rows, cols, unique_strings) # 测试启用shared strings的情况 with_shared_file = os.path.join(output_dir, f\"with_shared_{rows}_{cols}_{unique_strings}.xlsx\") start_time = time.time() write_excel_with_shared_strings(data, with_shared_file) with_shared_write_time = time.time() - start_time with_shared_read_time, with_shared_df = read_excel(with_shared_file) with_shared_size = os.path.getsize(with_shared_file) # 测试不启用shared strings的情况 without_shared_file = os.path.join(output_dir, f\"without_shared_{rows}_{cols}_{unique_strings}.xlsx\") start_time = time.time() write_excel_without_shared_strings(data, without_shared_file) without_shared_write_time = time.time() - start_time without_shared_read_time, without_shared_df = read_excel(without_shared_file) without_shared_size = os.path.getsize(without_shared_file) # 验证数据一致性 assert with_shared_df.equals(without_shared_df), \"两种方式保存的数据不一致!\" # 返回结果 return { \'rows\': rows, \'cols\': cols, \'unique_strings\': unique_strings, \'with_shared_write_time\': with_shared_write_time, \'with_shared_read_time\': with_shared_read_time, \'with_shared_size\': with_shared_size, \'without_shared_write_time\': without_shared_write_time, \'without_shared_read_time\': without_shared_read_time, \'without_shared_size\': without_shared_size }def main(): \"\"\"主函数\"\"\" # 测试配置 rows = 100 cols = 10000 unique_strings_list = [100, 1000, 10000, 100000] results = [] # 运行测试 for unique_strings in unique_strings_list: result = test_scenario(rows, cols, unique_strings) results.append(result) # 打印当前测试结果 print(f\"\\n测试结果 (唯一字符串数: {unique_strings}):\") print(f\"启用shared strings - 写入时间: {result[\'with_shared_write_time\']:.2f}s, \" f\"读取时间: {result[\'with_shared_read_time\']:.2f}s, \" f\"文件大小: {result[\'with_shared_size\']/1024/1024:.2f}MB\") print(f\"禁用shared strings - 写入时间: {result[\'without_shared_write_time\']:.2f}s, \" f\"读取时间: {result[\'without_shared_read_time\']:.2f}s, \" f\"文件大小: {result[\'without_shared_size\']/1024/1024:.2f}MB\") # 绘制结果图表 plot_results(results)def plot_results(results): from pylab import mpl # 设置显示中文字体 mpl.rcParams[\"font.sans-serif\"] = [\"SimSun\"] # 设置正常显示符号 mpl.rcParams[\"axes.unicode_minus\"] = False \"\"\"绘制测试结果图表\"\"\" unique_strings = [r[\'unique_strings\'] for r in results] # 创建图表 fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(12, 10)) # 写入时间对比 with_shared_write_times = [r[\'with_shared_write_time\'] for r in results] without_shared_write_times = [r[\'without_shared_write_time\'] for r in results] ax1.plot(unique_strings, with_shared_write_times, \'o-\', label=\'启用shared strings\') ax1.plot(unique_strings, without_shared_write_times, \'o-\', label=\'禁用shared strings\') ax1.set_xlabel(\'唯一字符串数量\') ax1.set_ylabel(\'写入时间 (秒)\') ax1.set_title(\'写入时间对比\') ax1.legend() ax1.grid(True) # 读取时间对比 with_shared_read_times = [r[\'with_shared_read_time\'] for r in results] without_shared_read_times = [r[\'without_shared_read_time\'] for r in results] ax2.plot(unique_strings, with_shared_read_times, \'o-\', label=\'启用shared strings\') ax2.plot(unique_strings, without_shared_read_times, \'o-\', label=\'禁用shared strings\') ax2.set_xlabel(\'唯一字符串数量\') ax2.set_ylabel(\'读取时间 (秒)\') ax2.set_title(\'读取时间对比\') ax2.legend() ax2.grid(True) # 文件大小对比 with_shared_sizes = [r[\'with_shared_size\']/1024/1024 for r in results] without_shared_sizes = [r[\'without_shared_size\']/1024/1024 for r in results] ax3.plot(unique_strings, with_shared_sizes, \'o-\', label=\'启用shared strings\') ax3.plot(unique_strings, without_shared_sizes, \'o-\', label=\'禁用shared strings\') ax3.set_xlabel(\'唯一字符串数量\') ax3.set_ylabel(\'文件大小 (MB)\') ax3.set_title(\'文件大小对比\') ax3.legend() ax3.grid(True) # 总时间对比 with_shared_total_times = [r[\'with_shared_write_time\'] + r[\'with_shared_read_time\'] for r in results] without_shared_total_times = [r[\'without_shared_write_time\'] + r[\'without_shared_read_time\'] for r in results] ax4.plot(unique_strings, with_shared_total_times, \'o-\', label=\'启用shared strings\') ax4.plot(unique_strings, without_shared_total_times, \'o-\', label=\'禁用shared strings\') ax4.set_xlabel(\'唯一字符串数量\') ax4.set_ylabel(\'总时间 (秒)\') ax4.set_title(\'总时间 (写入+读取) 对比\') ax4.legend() ax4.grid(True) plt.tight_layout() plt.savefig(\'shared_strings_performance_comparison.png\', dpi=300) plt.show() # 打印详细结果表格 print(\"\\n详细测试结果:\") print(\"唯一字符串数 | 启用shared写入时间 | 禁用shared写入时间 | 启用shared读取时间 | 禁用shared读取时间 | 启用shared文件大小 | 禁用shared文件大小\") for r in results: print(f\"{r[\'unique_strings\']:>12} | {r[\'with_shared_write_time\']:>17.2f} | {r[\'without_shared_write_time\']:>17.2f} | \" f\"{r[\'with_shared_read_time\']:>17.2f} | {r[\'without_shared_read_time\']:>17.2f} | \" f\"{r[\'with_shared_size\']/1024/1024:>17.2f} | {r[\'without_shared_size\']/1024/1024:>17.2f}\")if __name__ == \"__main__\": main()
执行结果
生成测试数据: 100行 x 10000列, 100个唯一字符串测试结果 (唯一字符串数: 100):启用shared strings - 写入时间: 0.91s, 读取时间: 2.70s, 文件大小: 1.60MB禁用shared strings - 写入时间: 10.37s, 读取时间: 9.21s, 文件大小: 12.16MB生成测试数据: 100行 x 10000列, 1000个唯一字符串测试结果 (唯一字符串数: 1000):启用shared strings - 写入时间: 0.89s, 读取时间: 2.94s, 文件大小: 2.24MB禁用shared strings - 写入时间: 11.71s, 读取时间: 9.52s, 文件大小: 53.34MB生成测试数据: 100行 x 10000列, 10000个唯一字符串测试结果 (唯一字符串数: 10000):启用shared strings - 写入时间: 0.85s, 读取时间: 2.97s, 文件大小: 3.29MB禁用shared strings - 写入时间: 12.11s, 读取时间: 9.60s, 文件大小: 64.52MB生成测试数据: 100行 x 10000列, 100000个唯一字符串测试结果 (唯一字符串数: 100000):启用shared strings - 写入时间: 3.10s, 读取时间: 3.77s, 文件大小: 9.54MB禁用shared strings - 写入时间: 12.65s, 读取时间: 9.96s, 文件大小: 66.06MB
结果显示禁用shared strings时写入更慢且文件更大,而启用时文件更小且写入更快,随着唯一字符串数量增加,启用shared strings的效率下降,不启用没有变化。验证了shared strings在重复字符串场景下的存储优化效果。