PDF 操作工具集
本技能提供 PDF 文档操作的完整指南。
依赖库
pip install reportlab pypdf pdfplumber
1. 创建 PDF (reportlab)
基础文档
from reportlab.lib.pagesizes import A4, letter
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch, cm
from reportlab.platypus import (
    SimpleDocTemplate, Paragraph, Spacer, Image,
    Table, TableStyle, PageBreak
)
from reportlab.lib import colors

# Page setup: A4 paper with the same margin (72 units) on all four sides.
margins = dict(rightMargin=72, leftMargin=72, topMargin=72, bottomMargin=72)
doc = SimpleDocTemplate("output.pdf", pagesize=A4, **margins)

# Stock stylesheet; individual styles are looked up by name below.
styles = getSampleStyleSheet()

# The story is the ordered list of flowables that make up the document:
# a heading, a gap, one body paragraph, and a trailing gap.
story = [
    Paragraph("报告标题", styles['Heading1']),
    Spacer(1, 12),
    Paragraph("这是正文内容。", styles['Normal']),
    Spacer(1, 12),
]

# Lay out the story and write output.pdf.
doc.build(story)
添加图片
from reportlab.lib.utils import ImageReader
from reportlab.platypus import Image

# Insert an image at a fixed width while keeping its aspect ratio.
# Passing only `width` to Image() leaves the height at the file's native
# value, which stretches the picture. Read the real pixel size first and
# derive the matching height.
target_width = 400
px_width, px_height = ImageReader("chart.png").getSize()
img = Image(
    "chart.png",
    width=target_width,
    height=target_width * px_height / px_width,
)
story.append(img)
添加表格
from reportlab.platypus import Table, TableStyle
from reportlab.lib import colors

# The first row is rendered as the header; the rest are body rows.
data = [
    ['标题1', '标题2', '标题3'],
    ['数据1', '数据2', '数据3'],
    ['数据4', '数据5', '数据6'],
]

# Cell ranges are (column, row) start/end pairs; -1 means "last".
HEADER_ROW = ((0, 0), (-1, 0))
BODY_ROWS = ((0, 1), (-1, -1))
ALL_CELLS = ((0, 0), (-1, -1))

table_style = TableStyle([
    # Header: grey background, light text, larger font, extra bottom padding.
    ('BACKGROUND', *HEADER_ROW, colors.grey),
    ('TEXTCOLOR', *HEADER_ROW, colors.whitesmoke),
    ('ALIGN', *ALL_CELLS, 'CENTER'),
    ('FONTSIZE', *HEADER_ROW, 14),
    ('BOTTOMPADDING', *HEADER_ROW, 12),
    # Body: beige background.
    ('BACKGROUND', *BODY_ROWS, colors.beige),
    # 1-unit black grid around every cell.
    ('GRID', *ALL_CELLS, 1, colors.black),
])

table = Table(data)
table.setStyle(table_style)

story.append(table)
自定义样式
# Custom paragraph style: inherit everything from the stock 'Normal' style
# and override only the attributes listed here.
custom_overrides = {
    'fontSize': 12,
    'leading': 16,
    'spaceAfter': 12,
    'textColor': colors.darkblue,
}
custom_style = ParagraphStyle(
    'CustomStyle',
    parent=styles['Normal'],
    **custom_overrides,
)
2. 合并 PDF (pypdf)
from pypdf import PdfWriter

# PdfMerger was deprecated in pypdf 3.x and removed in pypdf 5.0, so
# importing it fails on a current install. PdfWriter.append() is the
# supported replacement and concatenates whole documents the same way.
merger = PdfWriter()

# Append the input files in the order they should appear in the result.
for source_path in ("file1.pdf", "file2.pdf", "file3.pdf"):
    merger.append(source_path)

# Write the combined document, then release the writer's resources.
merger.write("merged.pdf")
merger.close()
3. 拆分 PDF
from pypdf import PdfReader, PdfWriter

reader = PdfReader("input.pdf")

# Split the document: every page is written to its own single-page file,
# named page_1.pdf, page_2.pdf, ... (1-based numbering).
for page_number, page in enumerate(reader.pages, start=1):
    single_page = PdfWriter()
    single_page.add_page(page)
    with open(f"page_{page_number}.pdf", "wb") as out_file:
        single_page.write(out_file)
4. 提取文本 (pdfplumber)
import pdfplumber

# Print the extracted text of every page, in document order.
with pdfplumber.open("input.pdf") as pdf:
    for page in pdf.pages:
        print(page.extract_text())
5. 提取表格
import pdfplumber

# Walk every table found on every page and print it row by row.
with pdfplumber.open("input.pdf") as pdf:
    for page in pdf.pages:
        for table in page.extract_tables():
            for row in table:
                print(row)
6. 完整报告示例
import os
from datetime import datetime
from typing import Optional

from reportlab.lib import colors
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.units import inch
from reportlab.lib.utils import ImageReader
from reportlab.platypus import (
    SimpleDocTemplate, Paragraph, Spacer, Image, Table, TableStyle, PageBreak
)

# Width at which every chart is placed on the page.
CHART_WIDTH = 5 * inch


def _scaled_image(path, width):
    """Return an Image flowable `width` wide that keeps the file's aspect ratio."""
    # ImageReader opens the file immediately, so a missing or unreadable
    # chart raises here instead of later inside doc.build().
    px_width, px_height = ImageReader(path).getSize()
    return Image(path, width=width, height=width * px_height / px_width)


def create_research_report(
    output_path: str,
    title: str,
    summary: str,
    findings: list,
    charts: list,
    data_table: list,
    font_name: Optional[str] = None,
) -> str:
    """Build a research-report PDF and return the path it was written to.

    The report consists of a cover page (title and generation date), an
    executive summary, a numbered list of key findings, an optional chart
    section and an optional data table.

    Args:
        output_path: Destination file. Missing parent directories are created.
        title: Report title shown on the cover page.
        summary: Text of the executive-summary section.
        findings: Strings rendered as a numbered list.
        charts: Image file paths. A chart that cannot be loaded is replaced
            by a one-line notice instead of aborting the report.
        data_table: Rows of cells; the first row is styled as the header.
            An empty value skips the section.
        font_name: Name of an already registered font to use for all text.
            The section headings are Chinese, so a CJK-capable font is
            needed for them to render. When omitted, the stock fonts of the
            sample stylesheet are used unchanged.

    Returns:
        `output_path`, unchanged.

    Note:
        `title`, `summary` and each finding are passed to `Paragraph`, which
        interprets its text as markup; escape literal `<` and `&` beforehand.
    """
    # The example path below lives in a sub-directory; make sure it exists.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    doc = SimpleDocTemplate(output_path, pagesize=A4)
    styles = getSampleStyleSheet()
    if font_name:
        # Only these three styles are used in this report.
        for style_key in ('Title', 'Heading1', 'Normal'):
            styles[style_key].fontName = font_name

    story = []

    # Cover page
    story.append(Spacer(1, 2*inch))
    story.append(Paragraph(title, styles['Title']))
    story.append(Spacer(1, 0.5*inch))
    story.append(Paragraph(
        f"生成日期: {datetime.now().strftime('%Y-%m-%d')}",
        styles['Normal']
    ))
    story.append(PageBreak())

    # Executive summary
    story.append(Paragraph("执行摘要", styles['Heading1']))
    story.append(Spacer(1, 12))
    story.append(Paragraph(summary, styles['Normal']))
    story.append(Spacer(1, 24))

    # Key findings, numbered from 1
    story.append(Paragraph("关键发现", styles['Heading1']))
    story.append(Spacer(1, 12))
    for i, finding in enumerate(findings, 1):
        story.append(Paragraph(f"{i}. {finding}", styles['Normal']))
        story.append(Spacer(1, 6))
    story.append(Spacer(1, 24))

    # Charts
    if charts:
        story.append(Paragraph("数据可视化", styles['Heading1']))
        story.append(Spacer(1, 12))
        for chart_path in charts:
            try:
                img = _scaled_image(chart_path, CHART_WIDTH)
            except Exception:
                # Deliberate best-effort: one bad chart must not abort the
                # whole report, so note the failure in the document itself.
                story.append(Paragraph(f"图表加载失败: {chart_path}", styles['Normal']))
            else:
                story.append(img)
                story.append(Spacer(1, 12))

    # Data table
    if data_table:
        story.append(Paragraph("详细数据", styles['Heading1']))
        story.append(Spacer(1, 12))
        table_commands = [
            ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#4472C4')),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
            ('FONTSIZE', (0, 0), (-1, -1), 10),
            ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
            ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.HexColor('#E7E6E6')]),
        ]
        if font_name:
            # Table cells do not use the paragraph styles, so set the font here too.
            table_commands.append(('FONTNAME', (0, 0), (-1, -1), font_name))
        table = Table(data_table)
        table.setStyle(TableStyle(table_commands))
        story.append(table)

    # Lay out the story and write the PDF.
    doc.build(story)
    return output_path


# Usage example; guarded so that importing this module does not write a file.
if __name__ == "__main__":
    create_research_report(
        output_path="files/reports/research_report.pdf",
        title="市场研究报告",
        summary="本报告分析了...",
        findings=["发现1", "发现2", "发现3"],
        charts=["files/charts/chart1.png", "files/charts/chart2.png"],
        data_table=[
            ["指标", "2023", "2024", "增长率"],
            ["市场规模", "100亿", "120亿", "20%"],
            ["用户数", "1000万", "1500万", "50%"],
        ]
    )
注意事项
-
中文支持:reportlab 默认不支持中文,需要注册中文字体:
from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont

# Register the TrueType file under the font name 'SimHei'.
# The second argument is the path to the .ttf file.
pdfmetrics.registerFont(TTFont('SimHei', 'SimHei.ttf'))

# Registration alone does not change any style: the sample styles keep
# their default font. Select the registered font explicitly via fontName.
chinese_style = ParagraphStyle(
    'ChineseNormal',
    parent=getSampleStyleSheet()['Normal'],
    fontName='SimHei',
    wordWrap='CJK',  # line-breaking rules suited to Chinese text
)

# Tables need the font too, e.g. in a TableStyle:
#     ('FONTNAME', (0, 0), (-1, -1), 'SimHei')
图片格式:支持 PNG, JPG, GIF 格式
-
内存优化:处理大文件时逐页处理(逐页读取、逐页写出),避免一次性把所有页面的内容保存在内存中