最近需要批量处理 Word 文档:删除其中的广告文字、页眉页脚和换行符,并重命名文件。这些操作复杂又重复,用一个 Python 脚本就能轻松解决。下面是脚本内容,保留在这里方便后期查阅和修改。附件中另有生成好的 exe 文件,运行后直接选择文件路径即可完成处理。
import os
import win32com.client
from docx import Document
import re
import shutil
import zipfile
def _decode_zip_member_name(member):
    """Return the member's file name, repairing legacy GBK-encoded names."""
    # Bit 11 (0x800) of the general-purpose flags marks a UTF-8 name, which
    # zipfile has already decoded correctly; re-decoding would corrupt it.
    if member.flag_bits & 0x800:
        return member.filename
    try:
        # Names without the UTF-8 flag are decoded by zipfile as cp437;
        # undo that and decode the raw bytes as GBK instead.
        return member.filename.encode('cp437').decode('gbk')
    except UnicodeError:
        # Covers both the encode and the decode failure: keep the raw name.
        return member.filename


def _is_within_directory(root, candidate):
    """Return True when *candidate* is *root* itself or lies below it."""
    try:
        return os.path.commonpath([root, candidate]) == root
    except ValueError:
        # Raised e.g. for paths on different drives: definitely outside.
        return False


def unzip_all_archives(folder_path):
    """Extract every ``.zip`` archive located directly in *folder_path*.

    Members are written into *folder_path* itself.  Member names are passed
    through a cp437 -> GBK repair so Chinese file names are not garbled.
    Members whose resolved path would land outside *folder_path* are skipped
    with a warning instead of being written.

    Errors are reported with ``print`` and not raised, so one broken archive
    does not prevent the remaining ones from being extracted.

    Args:
        folder_path: Directory scanned for archives; also the extraction
            destination.
    """
    root = os.path.realpath(folder_path)
    for file in os.listdir(folder_path):
        if not file.lower().endswith(".zip"):
            continue
        file_path = os.path.join(folder_path, file)
        try:
            with zipfile.ZipFile(file_path, 'r') as zip_ref:
                for member in zip_ref.infolist():
                    corrected_name = _decode_zip_member_name(member)
                    target_path = os.path.join(folder_path, corrected_name)
                    # Refuse "../" style names that escape the target folder.
                    if not _is_within_directory(root, os.path.realpath(target_path)):
                        print(f"[警告] 跳过不安全的路径:{corrected_name}")
                        continue
                    if member.is_dir():
                        os.makedirs(target_path, exist_ok=True)
                    else:
                        # Make sure the parent directory exists first.
                        os.makedirs(os.path.dirname(target_path), exist_ok=True)
                        with open(target_path, 'wb') as f:
                            f.write(zip_ref.read(member))
            print(f"[解压] 已解压(含中文修复):{file}")
        except Exception as e:
            # Best effort per archive: report and move on to the next one.
            print(f"[错误] 解压失败:{file} - {e}")
def _clear_header_footer(hdr):
    """Empty one Word HeaderFooter object: content, paragraph borders, fields."""
    # Remove the content.
    hdr.Range.Delete()
    # Remove paragraph border lines (the horizontal rule under a header).
    for para in hdr.Range.Paragraphs:
        borders = para.Borders
        # Indices 1-6 are intended to cover the outer and inner borders.
        # NOTE(review): confirm these indices against WdBorderType.
        for i in range(1, 7):
            borders(i).LineStyle = 0  # 0 = wdLineStyleNone
    # Remove leftover fields such as page numbers, last to first so the
    # remaining indices stay valid.
    fields = hdr.Range.Fields
    for f in reversed(range(fields.Count)):
        try:
            fields.Item(f + 1).Delete()
        except Exception:
            # Deliberately best effort: a field that cannot be deleted must
            # not abort the conversion.
            pass


def convert_doc_to_docx(doc_path):
    """Convert a ``.doc`` file to ``.docx`` through Word, without headers/footers.

    The document is opened in a hidden Word instance, its headers and footers
    are cleared, and only then is it saved in docx format, so the saved file
    always contains the cleanup.  Word is closed even when a step fails.

    Args:
        doc_path: Path of the source ``.doc`` file.

    Returns:
        The path of the written file, i.e. *doc_path* with ``"x"`` appended.

    Raises:
        Exception: Whatever the COM layer raises when Word cannot open or
            save the document; per-header cleanup failures are only printed.
    """
    docx_path = doc_path + "x"
    word = win32com.client.Dispatch("Word.Application")
    try:
        word.Visible = False
        # Word resolves relative paths against its own working directory,
        # so hand it absolute paths.
        doc = word.Documents.Open(os.path.abspath(doc_path))
        try:
            for section in doc.Sections:
                # 1 = primary, 2 = first page, 3 = even pages.
                for kind in (1, 2, 3):
                    for hdr in (section.Headers(kind), section.Footers(kind)):
                        try:
                            if hdr.Exists:
                                _clear_header_footer(hdr)
                        except Exception as e:
                            print(f"[警告] 清除页眉/页脚样式失败:{e}")
            # Save after the cleanup so the changes are part of the output.
            doc.SaveAs(os.path.abspath(docx_path), FileFormat=16)
        finally:
            # 0 = wdDoNotSaveChanges: everything needed is already saved,
            # and this avoids a save prompt in the hidden Word instance.
            doc.Close(0)
    finally:
        word.Quit()
    print(f"[转换+清除页眉页脚] 成功:{os.path.basename(docx_path)}")
    return docx_path
def delete_paragraphs(doc, indices):
    """Remove the paragraphs at the given positions from the document body.

    Positions refer to ``doc.paragraphs`` as it is when the function is
    called.  Duplicate positions are removed once, and positions outside
    ``0 .. len(doc.paragraphs) - 1`` are ignored instead of raising.

    Args:
        doc: Document object exposing ``paragraphs`` and ``_body._element``.
        indices: Iterable of integer paragraph positions to remove.
    """
    # Snapshot once: the paragraph list is then not rebuilt per removal, and
    # earlier removals cannot shift the positions of later ones.
    paragraphs = doc.paragraphs
    body = doc._body._element
    count = len(paragraphs)
    for i in sorted(set(indices), reverse=True):
        if 0 <= i < count:
            body.remove(paragraphs[i]._element)
def delete_empty_paragraphs_above_title(doc, keyword="合同"):
    """Delete the run of blank paragraphs directly above the title.

    The title is the first paragraph whose stripped text contains a keyword.
    Walking upwards from it, every whitespace-only paragraph is removed until
    the first non-blank paragraph is reached.  Nothing happens when no
    paragraph matches.

    Args:
        doc: Document object exposing ``paragraphs``.
        keyword: A single keyword string, or an iterable of keyword strings
            of which any one identifies the title.
    """
    keywords = (keyword,) if isinstance(keyword, str) else tuple(keyword)
    # Read the paragraph list once instead of once per inspected paragraph.
    paragraphs = doc.paragraphs
    for i, para in enumerate(paragraphs):
        text = para.text.strip()
        if not any(k in text for k in keywords):
            continue
        to_delete = []
        for j in range(i - 1, -1, -1):
            if paragraphs[j].text.strip():
                break
            to_delete.append(j)
        if to_delete:
            delete_paragraphs(doc, to_delete)
        print(f"[处理] 删除标题上方空段落:{len(to_delete)} 段")
        # Only the first matching paragraph is treated as the title.
        break
def delete_single_empty_paragraph_above_title(doc, keywords=("合同", "协议")):
    """Delete at most one blank paragraph sitting right above the title.

    The title is the first paragraph whose text contains any of *keywords*.
    If the paragraph immediately before it is whitespace-only, that single
    paragraph is removed; otherwise a "skipped" message is printed.  When no
    paragraph matches, nothing is printed or changed.

    Args:
        doc: Document object exposing ``paragraphs``.
        keywords: Strings of which any one identifies the title paragraph.
    """
    for position, paragraph in enumerate(doc.paragraphs):
        if not any(word in paragraph.text for word in keywords):
            continue
        blank_above = (
            position > 0
            and doc.paragraphs[position - 1].text.strip() == ""
        )
        if blank_above:
            delete_paragraphs(doc, [position - 1])
            print("[处理] 删除标题前的 1 个空段落")
        else:
            print("[跳过] 标题上方无空段落")
        # Only the first matching paragraph is considered.
        return
def delete_after_title(doc, keywords=("合同", "协议", "书"), lines=6):
    """Delete up to *lines* paragraphs directly below the title.

    The title is the first paragraph whose stripped text contains any of
    *keywords*.  When fewer than *lines* paragraphs follow it, only the
    existing ones are removed.  Nothing happens when no paragraph matches.

    Args:
        doc: Document object exposing ``paragraphs``.
        keywords: Strings of which any one identifies the title paragraph.
        lines: Maximum number of paragraphs to delete after the title.
    """
    paragraphs = doc.paragraphs
    for i, para in enumerate(paragraphs):
        text = para.text.strip()
        if not any(keyword in text for keyword in keywords):
            continue
        first = i + 1
        # Clamp to the end of the document so a title near the end does not
        # produce out-of-range paragraph positions.
        last = min(first + lines, len(paragraphs))
        targets = list(range(first, last))
        if targets:
            delete_paragraphs(doc, targets)
        # Report what was actually removed, which can be fewer than `lines`.
        print(f"[处理] 删除标题“{text}”下方的 {len(targets)} 行")
        break
def delete_trailing_ad_and_blank_pages(doc):
    """Delete blank and advertisement paragraphs at the end of the document.

    Scanning backwards from the last paragraph, every paragraph that is
    whitespace-only or contains one of the ad keywords is collected.  The
    scan stops at the first paragraph with real content, so only the trailing
    run is removed and nothing inside the document body is touched.

    Args:
        doc: Document object exposing ``paragraphs``.
    """
    keywords = ("法宝", "pkulaw", "下载日期", "原文链接", "扫描二维码", "©", "https://", "www.")
    # Read the paragraph list once instead of once per inspected paragraph.
    paragraphs = doc.paragraphs
    to_delete = []
    for i in range(len(paragraphs) - 1, -1, -1):
        text = paragraphs[i].text.strip()
        if text and not any(k in text for k in keywords):
            # First real content from the end: stop unconditionally, even if
            # nothing was collected, so interior paragraphs are never removed.
            break
        to_delete.append(i)
    if to_delete:
        delete_paragraphs(doc, to_delete)
        print(f"[处理] 删除尾部广告/空段落:{len(to_delete)} 段")
def process_docx(docx_path, output_path):
    """Clean one ``.docx`` file and write the result to *output_path*.

    The cleanup passes run in a fixed order: one blank paragraph above the
    title, the remaining blank paragraphs above the title, six paragraphs
    below the title, and finally trailing ad/blank paragraphs.

    Args:
        docx_path: Path of the ``.docx`` file to load.
        output_path: Path the cleaned document is saved to.
    """
    document = Document(docx_path)

    delete_single_empty_paragraph_above_title(document)
    delete_empty_paragraphs_above_title(document)
    delete_after_title(document, lines=6, keywords=("合同", "协议", "书"))
    delete_trailing_ad_and_blank_pages(document)

    document.save(output_path)
    saved_name = os.path.basename(output_path)
    print(f"[完成] 保存为:{saved_name}")
def delete_processed_doc_folder(folder_path):
    """Remove the ``已处理原文件`` sub-folder of *folder_path*, if present.

    The folder and everything in it is deleted.  A failure is printed rather
    than raised; a missing folder only prints a "skipped" message.

    Args:
        folder_path: Directory expected to contain the sub-folder.
    """
    target = os.path.join(folder_path, "已处理原文件")
    if not os.path.exists(target):
        print("[跳过] 没有已处理原文件夹,无需删除")
        return
    try:
        shutil.rmtree(target)
        print(f"[清理] 已删除文件夹:{target}")
    except Exception as err:
        print(f"[错误] 删除文件夹失败:{err}")
def delete_unnumbered_docx_files(folder_path):
    """Delete every ``.docx`` in *folder_path* that lacks a numeric prefix.

    A file is kept only when its name looks like ``<digits>_<anything>.docx``
    (case-insensitive).  Each deletion is printed; a failed deletion is
    printed as an error and does not stop the loop.

    Args:
        folder_path: Directory whose direct entries are examined.
    """
    numbered = re.compile(r"^\d+_.*\.docx$", re.IGNORECASE)
    unnumbered = [
        name
        for name in os.listdir(folder_path)
        if name.lower().endswith(".docx") and numbered.match(name) is None
    ]
    for name in unnumbered:
        try:
            os.remove(os.path.join(folder_path, name))
            print(f"[清理] 删除未编号文件:{name}")
        except Exception as err:
            print(f"[错误] 删除 {name} 失败:{err}")
def process_all_docs_in_folder(folder_path):
    """Run the whole batch pipeline on the ``.doc`` files in *folder_path*.

    Steps: extract all zip archives, convert each ``.doc`` to ``.docx`` with
    headers/footers cleared, clean the converted document, save it as
    ``<n>_<prefix>.docx``, move the original into ``已处理原文件``, then delete
    the un-numbered ``.docx`` files, the ``已处理原文件`` folder and the zip
    archives.

    ``<prefix>`` is the file name without its extension, cut at the first
    ``(``.  Files are handled in sorted name order so the numbering is
    reproducible.  A failure on one file is printed and the loop continues;
    that file does not consume a number.

    Args:
        folder_path: Directory containing the archives and ``.doc`` files.
    """
    unzip_all_archives(folder_path)  # extract every zip first
    processed_folder = os.path.join(folder_path, "已处理原文件")
    os.makedirs(processed_folder, exist_ok=True)
    counter = 1  # numbering starts at 1
    # os.listdir returns entries in arbitrary order; sort for stable numbers.
    for file in sorted(os.listdir(folder_path)):
        full_path = os.path.join(folder_path, file)
        if not file.lower().endswith(".doc") or not os.path.isfile(full_path):
            continue
        try:
            # Convert to docx and clear headers/footers.
            docx_path = convert_doc_to_docx(full_path)
            # Drop the extension before cutting at "(" so a name without
            # parentheses does not end up as "<n>_name.doc.docx".
            stem = os.path.splitext(file)[0]
            prefix = stem.split("(")[0].strip()
            output_file = f"{counter}_{prefix}.docx"
            output_path = os.path.join(folder_path, output_file)
            process_docx(docx_path, output_path)
            # Move the original .doc; os.replace overwrites a leftover file
            # of the same name instead of failing on Windows.
            os.replace(full_path, os.path.join(processed_folder, file))
            print(f"[移动] 已移动原始文件至:{processed_folder}")
            counter += 1
        except Exception as e:
            print(f"[错误] 处理 {file} 失败:{e}")
    delete_unnumbered_docx_files(folder_path)
    print("[完成] 所有文件处理完成,已清理未编号 .docx 文件")
    delete_processed_doc_folder(folder_path)  # remove the moved originals
    print("[完成] 所有文件处理完成,环境已清理")
    delete_zip_files(folder_path)  # remove all zip archives
def delete_zip_files(folder_path):
    """Delete every ``.zip`` file located directly in *folder_path*.

    The extension check is case-insensitive.  Each deletion is printed; a
    failed deletion is printed as an error and does not stop the loop.

    Args:
        folder_path: Directory whose direct entries are examined.
    """
    for entry in os.listdir(folder_path):
        if not entry.lower().endswith(".zip"):
            continue
        archive = os.path.join(folder_path, entry)
        try:
            os.remove(archive)
            print(f"[清理] 已删除压缩包:{entry}")
        except Exception as err:
            print(f"[错误] 删除压缩包失败:{entry} - {err}")
# ========= Change this to your own folder path =========
folder = r"D:\U盘备份资料\合同范本\7建设工程合同 (712)\2工程设计合同 (41)"  # <- replace with your own path

if __name__ == "__main__":
    # Run the batch job only when executed as a script, so importing this
    # module to reuse a helper does not start converting and deleting files.
    process_all_docs_in_folder(folder)
- 图形界面选择文件夹;
- 自动解压 .zip(含中文修复);
- 自动转换 .doc 为 .docx,清除页眉、横线、页码;
- 删除标题上方空行、标题下冗余段;
- 删除尾页广告页、空白页;
- 自动编号重命名;
- 删除 .zip、原始 .doc、未编号 .docx;
- 打开处理结果目录!

声明:
本站所有文章,如无特殊说明或标注,均为本站原创发布。
任何个人或组织,在未征得本站同意时,禁止复制、盗用、采集、发布本站内容到任何网站、书籍等各类媒体平台。
如若本站内容侵犯了原著者的合法权益,可联系我们进行处理。
