最近需要批量处理 Word 文档:删除其中的广告文字、删除页眉页脚、删除换行符、重命名文件。这些操作繁琐又重复,用一个 Python 脚本就能轻松解决。下面是脚本内容,保留在这里方便后期查阅和修改。附件是生成好的 exe 文件,运行后直接选择文件夹路径即可完成处理。
"""Batch clean-up of Word contract templates.

For every legacy ``.doc`` file in a folder the script:

1. extracts any ``.zip`` archives (repairing GBK-encoded member names),
2. converts the ``.doc`` to ``.docx`` through Word COM automation while
   clearing the primary header and footer,
3. removes blank paragraphs around the title, the lines below the title and
   advertisement/blank paragraphs near the end of the document,
4. saves the result under a numbered name, and
5. deletes the archives, the original ``.doc`` files and every ``.docx``
   that does not carry a numeric prefix.

Requires Windows with Microsoft Word installed, plus ``pywin32`` and
``python-docx``.
"""
import os
import re
import shutil
import zipfile

import win32com.client
from docx import Document


def _decode_zip_member_name(member):
    """Return the member's file name with GBK mojibake repaired if possible.

    ``zipfile`` decodes names as UTF-8 when bit 11 (0x800) of the general
    purpose flags is set and as cp437 otherwise.  Archives created by Chinese
    Windows tools usually store GBK bytes without that flag, so the cp437
    text is re-encoded and decoded as GBK.  When the round trip is impossible
    the name is returned unchanged.
    """
    if member.flag_bits & 0x800:
        # The name was already decoded as UTF-8; re-decoding would corrupt it.
        return member.filename
    try:
        return member.filename.encode('cp437').decode('gbk')
    except (UnicodeEncodeError, UnicodeDecodeError):
        return member.filename  # fallback


def _is_within_directory(directory, target):
    """Return True when *target* resolves to *directory* or a path below it."""
    directory = os.path.normcase(os.path.realpath(directory))
    target = os.path.normcase(os.path.realpath(target))
    try:
        return os.path.commonpath([directory, target]) == directory
    except ValueError:
        # Raised e.g. for paths on different drives: definitely outside.
        return False


def unzip_all_archives(folder_path):
    """Extract every ``.zip`` archive located directly in *folder_path*.

    Members are written below *folder_path* using their repaired names.
    Members whose path would escape *folder_path* (absolute paths or ``..``
    components) are skipped with a warning.  A failure in one archive is
    reported and does not stop the remaining archives.
    """
    for file in os.listdir(folder_path):
        if not file.lower().endswith(".zip"):
            continue
        file_path = os.path.join(folder_path, file)
        try:
            with zipfile.ZipFile(file_path, 'r') as zip_ref:
                for member in zip_ref.infolist():
                    corrected_name = _decode_zip_member_name(member)
                    target_path = os.path.join(folder_path, corrected_name)

                    # Guard against "zip slip": never write outside the folder.
                    if not _is_within_directory(folder_path, target_path):
                        print(f"[警告] 跳过不安全的路径:{corrected_name}")
                        continue

                    if member.is_dir():
                        os.makedirs(target_path, exist_ok=True)
                    else:
                        # Make sure the parent directory exists.
                        os.makedirs(os.path.dirname(target_path), exist_ok=True)
                        with zip_ref.open(member) as src, open(target_path, 'wb') as dst:
                            shutil.copyfileobj(src, dst)
            print(f"[解压] 已解压(含中文修复):{file}")
        except Exception as e:
            print(f"[错误] 解压失败:{file} - {e}")


def _clear_headers_and_footers(doc):
    """Empty the primary header and footer of every section of a Word document.

    *doc* is a Word ``Document`` COM object.  Each header/footer is handled on
    a best-effort basis: a failure is reported and the next one is processed.
    """
    for section in doc.Sections:
        # Index 1 is the primary header/footer of the section.
        for hdr in [section.Headers(1), section.Footers(1)]:
            try:
                # Remove the content.
                hdr.Range.Delete()

                # Remove paragraph border lines (the horizontal rule).
                # NOTE(review): the documented WdBorderType constants are
                # negative (-1..-6); confirm that indices 1..6 address the
                # intended borders.
                for para in hdr.Range.Paragraphs:
                    borders = para.Borders
                    for i in range(1, 7):
                        borders(i).LineStyle = 0  # 0 = wdLineStyleNone

                # Remove remaining fields such as page numbers, last to first
                # so the 1-based indices stay valid while deleting.
                fields = hdr.Range.Fields
                for index in range(fields.Count, 0, -1):
                    try:
                        fields.Item(index).Delete()
                    except Exception:
                        # Best effort: a field that cannot be deleted is left.
                        pass
            except Exception as e:
                print(f"[警告] 清除页眉/页脚样式失败:{e}")


def convert_doc_to_docx(doc_path):
    """Convert a ``.doc`` file to ``.docx`` with headers and footers cleared.

    The document is opened in a hidden Word instance, its headers/footers are
    cleared, and the result is saved next to the original as ``<doc_path>x``.
    The original ``.doc`` file is not modified.  Word is always closed, even
    when the conversion fails.

    Returns the path of the created ``.docx`` file.
    """
    # Word resolves relative paths against its own working directory.
    doc_path = os.path.abspath(doc_path)
    docx_path = doc_path + "x"

    word = win32com.client.Dispatch("Word.Application")
    try:
        word.Visible = False
        doc = word.Documents.Open(doc_path)
        try:
            # Clean up first so that the saved .docx contains the changes.
            _clear_headers_and_footers(doc)
            doc.SaveAs(docx_path, FileFormat=16)  # 16 = wdFormatDocumentDefault
        finally:
            # False = do not save: avoids a save prompt in the hidden window.
            doc.Close(False)
    finally:
        word.Quit()

    print(f"[转换+清除页眉页脚] 成功:{os.path.basename(docx_path)}")
    return docx_path


def delete_paragraphs(doc, indices):
    """Remove the paragraphs at the given positions of ``doc.paragraphs``.

    Positions refer to the paragraph list as it is when the function is
    called.  Positions outside the list are ignored, so callers may request
    a range that runs past the end of the document.
    """
    paragraphs = doc.paragraphs  # snapshot: the property rebuilds the list
    for i in sorted(set(indices), reverse=True):
        if 0 <= i < len(paragraphs):
            element = paragraphs[i]._element
            element.getparent().remove(element)


def delete_empty_paragraphs_above_title(doc, keyword="合同"):
    """Delete the run of blank paragraphs directly above the title.

    The title is the first paragraph whose text contains *keyword*.  Nothing
    happens when no paragraph contains it.
    """
    paragraphs = doc.paragraphs
    for i, para in enumerate(paragraphs):
        if keyword not in para.text.strip():
            continue
        to_delete = []
        for j in range(i - 1, -1, -1):
            if paragraphs[j].text.strip() == "":
                to_delete.append(j)
            else:
                break
        delete_paragraphs(doc, to_delete)
        print(f"[处理] 删除标题上方空段落:{len(to_delete)} 段")
        break


def delete_single_empty_paragraph_above_title(doc, keywords=("合同", "协议")):
    """Delete one blank paragraph directly above the title, if there is one.

    The title is the first paragraph whose text contains any of *keywords*.
    """
    paragraphs = doc.paragraphs
    for i, para in enumerate(paragraphs):
        if not any(k in para.text for k in keywords):
            continue
        if i > 0 and paragraphs[i - 1].text.strip() == "":
            delete_paragraphs(doc, [i - 1])
            print("[处理] 删除标题前的 1 个空段落")
        else:
            print("[跳过] 标题上方无空段落")
        break


def delete_after_title(doc, keywords=("合同", "协议", "书"), lines=6):
    """Delete up to *lines* paragraphs that follow the title.

    The title is the first paragraph whose stripped text contains any of
    *keywords*.  When fewer than *lines* paragraphs follow the title, only
    the existing ones are removed.
    """
    paragraphs = doc.paragraphs
    for i, para in enumerate(paragraphs):
        text = para.text.strip()
        if not any(keyword in text for keyword in keywords):
            continue
        # Clamp to the end of the document instead of failing on short files.
        targets = list(range(i + 1, min(i + 1 + lines, len(paragraphs))))
        delete_paragraphs(doc, targets)
        print(f"[处理] 删除标题“{text}”下方的 {len(targets)} 行")
        break


def delete_trailing_ad_and_blank_pages(doc):
    """Delete the last run of blank/advertisement paragraphs of the document.

    Paragraphs are scanned from the end.  Blank paragraphs and paragraphs
    containing one of the advertisement keywords are collected; the scan
    stops at the first other paragraph reached after something was collected.

    NOTE(review): ordinary paragraphs at the very end are skipped, not
    treated as a stop, so the run that is removed need not touch the end of
    the document.  Confirm this is intended.
    """
    keywords = ["法宝", "pkulaw", "下载日期", "原文链接", "扫描二维码", "©", "https://", "www."]
    paragraphs = doc.paragraphs
    to_delete = []

    for i in range(len(paragraphs) - 1, -1, -1):
        text = paragraphs[i].text.strip()
        if text == "" or any(k in text for k in keywords):
            to_delete.append(i)
            continue
        if to_delete:
            break

    if to_delete:
        delete_paragraphs(doc, to_delete)
        print(f"[处理] 删除尾部广告/空段落:{len(to_delete)} 段")


def process_docx(docx_path, output_path):
    """Apply all paragraph clean-up steps to *docx_path*, save to *output_path*."""
    doc = Document(docx_path)
    delete_single_empty_paragraph_above_title(doc)
    delete_empty_paragraphs_above_title(doc)
    delete_after_title(doc, keywords=("合同", "协议", "书"), lines=6)
    delete_trailing_ad_and_blank_pages(doc)
    doc.save(output_path)
    print(f"[完成] 保存为:{os.path.basename(output_path)}")


def delete_processed_doc_folder(folder_path):
    """Delete the ``已处理原文件`` sub-folder of *folder_path*, if it exists."""
    processed_folder = os.path.join(folder_path, "已处理原文件")
    if not os.path.exists(processed_folder):
        print("[跳过] 没有已处理原文件夹,无需删除")
        return
    try:
        shutil.rmtree(processed_folder)
        print(f"[清理] 已删除文件夹:{processed_folder}")
    except Exception as e:
        print(f"[错误] 删除文件夹失败:{e}")


def delete_unnumbered_docx_files(folder_path):
    """Delete every ``.docx`` in *folder_path* not named ``<digits>_*.docx``.

    This removes the intermediate conversion results, and also any other
    unnumbered ``.docx`` file that happens to be in the folder.
    """
    pattern = re.compile(r"^\d+_.*\.docx$", re.IGNORECASE)
    for file in os.listdir(folder_path):
        if file.lower().endswith(".docx") and not pattern.match(file):
            try:
                os.remove(os.path.join(folder_path, file))
                print(f"[清理] 删除未编号文件:{file}")
            except Exception as e:
                print(f"[错误] 删除 {file} 失败:{e}")


def delete_zip_files(folder_path):
    """Delete every ``.zip`` file located directly in *folder_path*."""
    for file in os.listdir(folder_path):
        if file.lower().endswith(".zip"):
            try:
                os.remove(os.path.join(folder_path, file))
                print(f"[清理] 已删除压缩包:{file}")
            except Exception as e:
                print(f"[错误] 删除压缩包失败:{file} - {e}")


def process_all_docs_in_folder(folder_path):
    """Run the complete clean-up pipeline on *folder_path*.

    Archives are extracted, each ``.doc`` file is converted, cleaned and saved
    as ``<n>_<prefix>.docx`` (``prefix`` is the file name without extension,
    cut at the first "("), and finally the originals, the unnumbered
    ``.docx`` files and the archives are deleted.  A failure on one file is
    reported and the remaining files are still processed.
    """
    unzip_all_archives(folder_path)  # extract all zip archives first

    processed_folder = os.path.join(folder_path, "已处理原文件")
    os.makedirs(processed_folder, exist_ok=True)

    counter = 1  # numbering starts at 1

    # Sorted so that the numbering is reproducible between runs.
    for file in sorted(os.listdir(folder_path)):
        lowered = file.lower()
        if not lowered.endswith(".doc") or lowered.endswith(".docx"):
            continue
        full_path = os.path.join(folder_path, file)
        try:
            # Convert to .docx and clear headers/footers.
            docx_path = convert_doc_to_docx(full_path)

            # Short output name: drop the extension, keep the text before the
            # first "(" and add the running number.
            stem = os.path.splitext(file)[0]
            prefix = stem.split("(")[0].strip()
            output_file = f"{counter}_{prefix}.docx"
            output_path = os.path.join(folder_path, output_file)

            process_docx(docx_path, output_path)

            # Move the original .doc away; os.replace also succeeds when a
            # file of the same name is left over from an earlier run.
            os.replace(full_path, os.path.join(processed_folder, file))
            print(f"[移动] 已移动原始文件至:{processed_folder}")

            counter += 1
        except Exception as e:
            print(f"[错误] 处理 {file} 失败:{e}")

    delete_unnumbered_docx_files(folder_path)
    print("[完成] 所有文件处理完成,已清理未编号 .docx 文件")
    delete_processed_doc_folder(folder_path)  # remove the moved originals
    print("[完成] 所有文件处理完成,环境已清理")
    delete_zip_files(folder_path)  # remove all zip archives


if __name__ == "__main__":
    # ========= Change this to your own folder path =========
    folder = r"D:\U盘备份资料\合同范本\7建设工程合同 (712)\2工程设计合同 (41)"
    process_all_docs_in_folder(folder)
- 图形界面选择文件夹;
- 自动解压 .zip(含中文修复);
- 自动转换 .doc 为 .docx,清除页眉、横线、页码;
- 删除标题上方空行、标题下冗余段;
- 删除尾页广告页、空白页;
- 自动编号重命名;
- 删除 .zip、原始 .doc、未编号 .docx;
- 打开处理结果目录!
声明:
本站所有文章,如无特殊说明或标注,均为本站原创发布。
任何个人或组织,在未征得本站同意时,禁止复制、盗用、采集、发布本站内容到任何网站、书籍等各类媒体平台。
如若本站内容侵犯了原著者的合法权益,可联系我们进行处理。
本站所有文章,如无特殊说明或标注,均为本站原创发布。
任何个人或组织,在未征得本站同意时,禁止复制、盗用、采集、发布本站内容到任何网站、书籍等各类媒体平台。
如若本站内容侵犯了原著者的合法权益,可联系我们进行处理。