add:markdown 转 pdf
This commit is contained in:
425
docling/PDF_API_USAGE.md
Normal file
425
docling/PDF_API_USAGE.md
Normal file
@@ -0,0 +1,425 @@
|
||||
# Word/Markdown 转 PDF API 使用指南
|
||||
|
||||
## API 端点
|
||||
|
||||
```
|
||||
POST /api/pdf/convert
|
||||
```
|
||||
|
||||
## 支持的输入方式
|
||||
|
||||
### 1. 上传文件
|
||||
|
||||
支持上传 `.doc`、`.docx`、`.md`(或 `.markdown`)文件:
|
||||
|
||||
```javascript
|
||||
// 上传 Word 文件
|
||||
const formData = new FormData();
|
||||
formData.append('file', fileInput.files[0]); // .doc 或 .docx
|
||||
|
||||
// 可选参数
|
||||
formData.append('toc', 'true'); // 生成目录
|
||||
formData.append('header_text', '文档标题|页码'); // 页眉
|
||||
formData.append('footer_text', '版权信息'); // 页脚
|
||||
formData.append('filename_text', '我的文档'); // 文件名
|
||||
|
||||
const response = await fetch('/api/pdf/convert', {
|
||||
method: 'POST',
|
||||
body: formData
|
||||
});
|
||||
|
||||
const blob = await response.blob();
|
||||
const url = URL.createObjectURL(blob);
|
||||
```
|
||||
|
||||
### 2. 指定本地文件路径
|
||||
|
||||
```javascript
|
||||
const formData = new FormData();
|
||||
formData.append('file_path', '/path/to/document.docx');
|
||||
formData.append('toc', 'true');
|
||||
|
||||
const response = await fetch('/api/pdf/convert', {
|
||||
method: 'POST',
|
||||
body: formData
|
||||
});
|
||||
```
|
||||
|
||||
### 3. 直接提交 Markdown 内容
|
||||
|
||||
```javascript
|
||||
const formData = new FormData();
|
||||
formData.append('markdown_content', '# 标题\n\n这是内容');
|
||||
formData.append('filename_text', '我的文档');
|
||||
|
||||
const response = await fetch('/api/pdf/convert', {
|
||||
method: 'POST',
|
||||
body: formData
|
||||
});
|
||||
```
|
||||
|
||||
## 完整参数列表
|
||||
|
||||
| 参数 | 类型 | 必填 | 说明 |
|
||||
|------|------|------|------|
|
||||
| file | File | 否* | 上传的文件 |
|
||||
| file_path | string | 否* | 服务端本地文件路径(在服务器上解析,而非浏览器所在的机器) |
|
||||
| markdown_content | string | 否* | Markdown 内容 |
|
||||
| toc | boolean | 否 | 是否生成目录,默认 false |
|
||||
| header_text | string | 否 | 页眉文本,可用 `\|` 分隔左右 |
|
||||
| footer_text | string | 否 | 页脚文本 |
|
||||
| logo_url | string | 否 | Logo 图片 URL |
|
||||
| copyright_text | string | 否 | 版权声明 |
|
||||
| filename_text | string | 否 | 显示的文件名 |
|
||||
| cover_src | string | 否 | 封面图片 URL |
|
||||
| product_name | string | 否 | 产品名称(封面) |
|
||||
| document_name | string | 否 | 文档名称(封面) |
|
||||
| product_version | string | 否 | 产品版本 |
|
||||
| document_version | string | 否 | 文档版本 |
|
||||
| css_name | string | 否 | CSS 样式名称 |
|
||||
| css_text | string | 否 | 自定义 CSS |
|
||||
| download | boolean | 否 | 是否直接下载,默认 true |
|
||||
|
||||
*注:file、file_path、markdown_content 三者必选其一
|
||||
|
||||
## 完整示例代码
|
||||
|
||||
### React 示例
|
||||
|
||||
```jsx
|
||||
import { useState, useRef } from 'react';
|
||||
|
||||
function PdfConverter() {
|
||||
const [loading, setLoading] = useState(false);
|
||||
const fileInput = useRef(null);
|
||||
|
||||
const convertToPdf = async () => {
|
||||
const file = fileInput.current.files[0];
|
||||
if (!file) return;
|
||||
|
||||
setLoading(true);
|
||||
|
||||
const formData = new FormData();
|
||||
formData.append('file', file);
|
||||
formData.append('toc', 'true');
|
||||
formData.append('header_text', '我的文档|第 {page} 页');
|
||||
formData.append('footer_text', '© 2024 公司名称');
|
||||
formData.append('filename_text', file.name.replace(/\.[^/.]+$/, ''));
|
||||
|
||||
try {
|
||||
const response = await fetch('/api/pdf/convert', {
|
||||
method: 'POST',
|
||||
body: formData
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
const error = await response.json();
|
||||
throw new Error(error.detail || '转换失败');
|
||||
}
|
||||
|
||||
const blob = await response.blob();
|
||||
const url = URL.createObjectURL(blob);
|
||||
|
||||
// 下载 PDF
|
||||
const a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = file.name.replace(/\.[^/.]+$/, '') + '.pdf';
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
document.body.removeChild(a);
|
||||
URL.revokeObjectURL(url);
|
||||
} catch (error) {
|
||||
console.error('转换失败:', error);
|
||||
alert('转换失败: ' + error.message);
|
||||
} finally {
|
||||
setLoading(false);
|
||||
}
|
||||
};
|
||||
|
||||
return (
|
||||
<div>
|
||||
<input
|
||||
type="file"
|
||||
ref={fileInput}
|
||||
accept=".doc,.docx,.md"
|
||||
/>
|
||||
<button
|
||||
onClick={convertToPdf}
|
||||
disabled={loading}
|
||||
>
|
||||
{loading ? '转换中...' : '转换为 PDF'}
|
||||
</button>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
```
|
||||
|
||||
### Vue 3 示例
|
||||
|
||||
```vue
|
||||
<template>
|
||||
<div>
|
||||
<input
|
||||
ref="fileInput"
|
||||
type="file"
|
||||
accept=".doc,.docx,.md"
|
||||
/>
|
||||
<button
|
||||
@click="convertToPdf"
|
||||
:disabled="loading"
|
||||
>
|
||||
{{ loading ? '转换中...' : '转换为 PDF' }}
|
||||
</button>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<script setup>
|
||||
import { ref } from 'vue';
|
||||
|
||||
const fileInput = ref(null);
|
||||
const loading = ref(false);
|
||||
|
||||
const convertToPdf = async () => {
|
||||
const file = fileInput.value.files[0];
|
||||
if (!file) return;
|
||||
|
||||
loading.value = true;
|
||||
|
||||
const formData = new FormData();
|
||||
formData.append('file', file);
|
||||
formData.append('toc', 'true');
|
||||
formData.append('header_text', '我的文档');
|
||||
formData.append('filename_text', file.name.replace(/\.[^/.]+$/, ''));
|
||||
|
||||
try {
|
||||
const response = await fetch('/api/pdf/convert', {
|
||||
method: 'POST',
|
||||
body: formData
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
const error = await response.json();
|
||||
throw new Error(error.detail || '转换失败');
|
||||
}
|
||||
|
||||
const blob = await response.blob();
|
||||
const url = URL.createObjectURL(blob);
|
||||
|
||||
const a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = file.name.replace(/\.[^/.]+$/, '') + '.pdf';
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
document.body.removeChild(a);
|
||||
URL.revokeObjectURL(url);
|
||||
} catch (error) {
|
||||
console.error('转换失败:', error);
|
||||
alert('转换失败: ' + error.message);
|
||||
} finally {
|
||||
loading.value = false;
|
||||
}
|
||||
};
|
||||
</script>
|
||||
```
|
||||
|
||||
### 原生 JavaScript 示例
|
||||
|
||||
```html
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Word/Markdown 转 PDF</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>文档转 PDF</h1>
|
||||
|
||||
<input type="file" id="fileInput" accept=".doc,.docx,.md">
|
||||
<button id="convertBtn">转换为 PDF</button>
|
||||
|
||||
<div id="status" style="margin-top: 10px;"></div>
|
||||
|
||||
<script>
|
||||
document.getElementById('convertBtn').addEventListener('click', async () => {
|
||||
const fileInput = document.getElementById('fileInput');
|
||||
const status = document.getElementById('status');
|
||||
const file = fileInput.files[0];
|
||||
|
||||
if (!file) {
|
||||
status.textContent = '请选择文件';
|
||||
return;
|
||||
}
|
||||
|
||||
status.textContent = '转换中...';
|
||||
|
||||
const formData = new FormData();
|
||||
formData.append('file', file);
|
||||
formData.append('toc', 'true');
|
||||
formData.append('header_text', '我的文档|{page}');
|
||||
formData.append('footer_text', '© 2024');
|
||||
formData.append('filename_text', file.name.replace(/\.[^/.]+$/, ''));
|
||||
|
||||
try {
|
||||
const response = await fetch('/api/pdf/convert', {
|
||||
method: 'POST',
|
||||
body: formData
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
const error = await response.json();
|
||||
throw new Error(error.detail || '转换失败');
|
||||
}
|
||||
|
||||
const blob = await response.blob();
|
||||
const url = URL.createObjectURL(blob);
|
||||
|
||||
const a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = file.name.replace(/\.[^/.]+$/, '') + '.pdf';
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
document.body.removeChild(a);
|
||||
URL.revokeObjectURL(url);
|
||||
|
||||
status.textContent = '转换成功!';
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
status.textContent = '转换失败: ' + error.message;
|
||||
}
|
||||
});
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
```
|
||||
|
||||
### Markdown 内容转 PDF 示例
|
||||
|
||||
```javascript
|
||||
async function markdownToPdf() {
|
||||
const markdownContent = `
|
||||
# 我的文档
|
||||
|
||||
## 第一章
|
||||
|
||||
这是第一章的内容。
|
||||
|
||||
### 小节
|
||||
|
||||
- 列表项 1
|
||||
- 列表项 2
|
||||
|
||||
| 列1 | 列2 |
|
||||
|-----|-----|
|
||||
| A | B |
|
||||
`;
|
||||
|
||||
const formData = new FormData();
|
||||
formData.append('markdown_content', markdownContent);
|
||||
formData.append('filename_text', '我的Markdown文档');
|
||||
formData.append('toc', 'true');
|
||||
formData.append('header_text', 'Markdown文档');
|
||||
formData.append('footer_text', '© 2024');
|
||||
|
||||
const response = await fetch('/api/pdf/convert', {
|
||||
method: 'POST',
|
||||
body: formData
|
||||
});
|
||||
|
||||
const blob = await response.blob();
|
||||
// 保存 PDF(saveAs 并非浏览器内置函数,由 FileSaver.js 提供,需另行引入)
|
||||
saveAs(blob, 'document.pdf');
|
||||
}
|
||||
```
|
||||
|
||||
## Python 调用示例
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
def convert_word_to_pdf(file_path, output_path):
|
||||
"""将 Word 文件转换为 PDF"""
|
||||
with open(file_path, 'rb') as f:
|
||||
files = {'file': f}
|
||||
data = {
|
||||
'toc': 'true',
|
||||
'header_text': '我的文档',
|
||||
'footer_text': '© 2024',
|
||||
'filename_text': '文档名称'
|
||||
}
|
||||
|
||||
response = requests.post(
|
||||
'http://localhost:8000/api/pdf/convert',
|
||||
files=files,
|
||||
data=data
|
||||
)
|
||||
|
||||
if response.status_code == 200:
|
||||
with open(output_path, 'wb') as out:
|
||||
out.write(response.content)
|
||||
print(f"PDF 已保存到: {output_path}")
|
||||
else:
|
||||
print(f"转换失败: {response.text}")
|
||||
|
||||
# 使用示例
|
||||
convert_word_to_pdf('document.docx', 'output.pdf')
|
||||
```
|
||||
|
||||
## cURL 示例
|
||||
|
||||
```bash
|
||||
# 上传 Word 文件转 PDF
|
||||
curl -X POST http://localhost:8000/api/pdf/convert \
|
||||
-F "file=@document.docx" \
|
||||
-F "toc=true" \
|
||||
-F "header_text=我的文档" \
|
||||
-F "footer_text=© 2024" \
|
||||
-o output.pdf
|
||||
|
||||
# Markdown 内容转 PDF
|
||||
curl -X POST http://localhost:8000/api/pdf/convert \
|
||||
-F $'markdown_content=# 标题\n\n这是内容' \
|
||||
-F "filename_text=文档" \
|
||||
-o output.pdf
|
||||
```
|
||||
|
||||
## 错误处理
|
||||
|
||||
API 返回的错误格式:
|
||||
|
||||
```json
|
||||
{
|
||||
"detail": "错误信息"
|
||||
}
|
||||
```
|
||||
|
||||
常见错误:
|
||||
|
||||
| 错误信息 | 原因 | 解决方法 |
|
||||
|---------|------|----------|
|
||||
| 必须提供 file、file_path 或 markdown_content 中的一个 | 未提供输入 | 检查请求参数 |
|
||||
| 不支持的文件格式 | 文件格式错误 | 确保是 .doc/.docx/.md/.markdown |
|
||||
| 文件不存在 | 本地文件路径无效 | 检查 file_path 参数 |
|
||||
| PDF 转换失败 | 转换过程出错 | 查看服务器日志 |
|
||||
|
||||
## 返回格式
|
||||
|
||||
### download=true (默认)
|
||||
|
||||
直接返回 PDF 文件流:
|
||||
|
||||
```
|
||||
Content-Type: application/pdf
|
||||
Content-Disposition: attachment; filename="document.pdf"
|
||||
```
|
||||
|
||||
### download=false
|
||||
|
||||
返回 JSON,包含 base64 编码的 PDF:
|
||||
|
||||
```json
|
||||
{
|
||||
"ok": true,
|
||||
"pdf_base64": "JVBERi0xLjQK...",
|
||||
"filename": "document.pdf",
|
||||
"size": 12345
|
||||
}
|
||||
```
|
||||
@@ -58,6 +58,12 @@ from app.services.docling_adapter import (
|
||||
)
|
||||
from app.services.unified_converter import FormatConverter
|
||||
from app.services.minio_utils import minio_current, join_prefix, presigned_read
|
||||
from app.services.pdf_converter import (
|
||||
word_to_pdf_bytes,
|
||||
markdown_to_pdf_bytes,
|
||||
markdown_file_to_pdf_bytes,
|
||||
read_file_content,
|
||||
)
|
||||
|
||||
"""
|
||||
@api Server Application
|
||||
@@ -2561,6 +2567,193 @@ async def api_convert(
|
||||
except Exception as e:
|
||||
return _err(str(e))
|
||||
|
||||
@app.post("/api/pdf/convert")
async def api_pdf_convert(
    file: Optional[UploadFile] = File(None),
    file_path: Optional[str] = Form(None),
    markdown_content: Optional[str] = Form(None),
    toc: bool = Form(False),
    header_text: Optional[str] = Form(None),
    footer_text: Optional[str] = Form(None),
    logo_url: Optional[str] = Form(None),
    copyright_text: Optional[str] = Form(None),
    filename_text: Optional[str] = Form(None),
    cover_src: Optional[str] = Form(None),
    product_name: Optional[str] = Form(None),
    document_name: Optional[str] = Form(None),
    product_version: Optional[str] = Form(None),
    document_version: Optional[str] = Form(None),
    css_name: Optional[str] = Form(None),
    css_text: Optional[str] = Form(None),
    download: bool = Form(True),
):
    """
    Convert a Word or Markdown document to PDF.

    Exactly one input source is used, checked in this order:

    1. ``file``: an uploaded ``.doc``/``.docx``/``.md``/``.markdown`` file.
    2. ``file_path``: a path that is opened on the machine running this
       service.
    3. ``markdown_content``: Markdown text supplied directly.

    The remaining form fields are forwarded unchanged to the converter
    functions; ``css_name``/``css_text`` are only forwarded for Markdown
    input.

    Returns:
        With ``download`` true, a ``StreamingResponse`` carrying the PDF as an
        attachment. Otherwise the result of ``_ok`` with the base64-encoded
        PDF, its filename and its size in bytes. Every failure, including an
        unexpected exception, is reported through ``_err``.
    """
    # Function-scope imports, as in the original implementation.
    import base64
    from fastapi.responses import StreamingResponse

    word_suffixes = {".doc", ".docx"}
    markdown_suffixes = {".md", ".markdown"}
    supported_suffixes = word_suffixes | markdown_suffixes

    # Options shared by every conversion path (filename_text is added per call
    # because its fallback depends on the input source).
    render_opts = dict(
        toc=toc,
        header_text=header_text,
        footer_text=footer_text,
        logo_url=logo_url,
        copyright_text=copyright_text,
        cover_src=cover_src,
        product_name=product_name,
        document_name=document_name,
        product_version=product_version,
        document_version=document_version,
    )

    async def _convert_file(source: Path, suffix: str, default_name: str) -> bytes:
        """Run the converter matching ``suffix`` in a worker thread."""
        opts = {**render_opts, "filename_text": filename_text or default_name}
        if suffix in word_suffixes:
            return await asyncio.to_thread(word_to_pdf_bytes, source, **opts)
        return await asyncio.to_thread(
            markdown_file_to_pdf_bytes,
            source,
            css_name=css_name,
            css_text=css_text,
            **opts,
        )

    try:
        pdf_bytes: bytes = b""
        output_filename: str = "document.pdf"

        if file:
            # Uploaded file: validate the extension before touching the disk.
            filename = file.filename or "upload"
            suffix = Path(filename).suffix.lower()
            if suffix not in supported_suffixes:
                return _err(f"不支持的文件格式: {suffix}。支持的格式: .doc, .docx, .md")

            stem = Path(filename).stem
            output_filename = f"{stem}.pdf"

            # NamedTemporaryFile creates the file atomically; tempfile.mktemp
            # only returns a name and is open to a race between naming and
            # creation.
            tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
            tmp_path = Path(tmp.name)
            try:
                with tmp:
                    tmp.write(await file.read())
                pdf_bytes = await _convert_file(tmp_path, suffix, stem)
            finally:
                # Cleanup is best-effort, but a failure is logged, not hidden.
                try:
                    tmp_path.unlink(missing_ok=True)
                except OSError:
                    logging.warning(
                        "Could not remove temporary upload %s", tmp_path, exc_info=True
                    )

        elif file_path:
            # NOTE(security): this opens a caller-supplied path on the server.
            # Only the four supported extensions are converted, but access
            # should still be confined to an allowed directory - TODO confirm
            # the intended deployment model.
            path = Path(file_path).expanduser()
            if not path.exists():
                return _err(f"文件不存在: {file_path}")

            suffix = path.suffix.lower()
            if suffix not in supported_suffixes:
                return _err(f"不支持的文件格式: {suffix}。支持的格式: .doc, .docx, .md")

            output_filename = f"{path.stem}.pdf"
            pdf_bytes = await _convert_file(path, suffix, path.stem)

        elif markdown_content:
            # Markdown text supplied directly in the form.
            output_filename = f"{filename_text or 'document'}.pdf"
            pdf_bytes = await asyncio.to_thread(
                markdown_to_pdf_bytes,
                markdown_content,
                filename_text=filename_text,
                css_name=css_name,
                css_text=css_text,
                **render_opts,
            )
        else:
            return _err("必须提供 file、file_path 或 markdown_content 中的一个")

        if not pdf_bytes:
            return _err("PDF 转换失败,未生成内容")

        if download:
            return StreamingResponse(
                io.BytesIO(pdf_bytes),
                media_type="application/pdf",
                headers={
                    "Content-Disposition": _pdf_content_disposition(output_filename)
                },
            )

        # download is false: return the PDF inline as base64 JSON.
        return _ok({
            "pdf_base64": base64.b64encode(pdf_bytes).decode("ascii"),
            "filename": output_filename,
            "size": len(pdf_bytes)
        })

    except Exception as e:
        logging.exception("PDF conversion error")
        return _err(f"PDF 转换失败: {str(e)}")


def _pdf_content_disposition(filename: str) -> str:
    """
    Build an ``attachment`` Content-Disposition value for any filename.

    The plain ``filename="..."`` parameter gets an ASCII-only fallback in which
    non-ASCII characters, control characters, double quotes and backslashes are
    replaced by ``_``, so the value is always header-safe. The exact name is
    carried in the percent-encoded ``filename*=UTF-8''...`` parameter
    (RFC 6266 / RFC 5987 syntax).
    """
    from urllib.parse import quote

    ascii_fallback = "".join(
        ch if ch.isascii() and ch.isprintable() and ch not in '"\\' else "_"
        for ch in filename
    )
    return (
        f"attachment; filename=\"{ascii_fallback}\"; "
        f"filename*=UTF-8''{quote(filename, safe='')}"
    )
|
||||
|
||||
@app.post("/api/import/convert")
|
||||
async def api_import_convert(json_file: UploadFile = File(None), json_text: Optional[str] = Form(None), path: Optional[str] = Form(None), versionId: Optional[int] = Form(1001), download: Optional[bool] = Form(False)):
|
||||
try:
|
||||
|
||||
198
docling/app/services/pdf_converter.py
Normal file
198
docling/app/services/pdf_converter.py
Normal file
@@ -0,0 +1,198 @@
|
||||
"""
|
||||
PDF Conversion Service
|
||||
Provides Word-to-PDF and Markdown-to-PDF conversion functionality
|
||||
"""
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, Tuple
|
||||
import io
|
||||
import tempfile
|
||||
import logging
|
||||
|
||||
from docx import Document as DocxDocument
|
||||
|
||||
from app.services.word2markdown import convert_any as convert_word_to_md
|
||||
from app.services.docling_adapter import md_to_pdf_bytes_with_renderer
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def word_to_pdf_bytes(
    file_path: str | Path,
    toc: bool = False,
    header_text: Optional[str] = None,
    footer_text: Optional[str] = None,
    logo_url: Optional[str] = None,
    copyright_text: Optional[str] = None,
    filename_text: Optional[str] = None,
    cover_src: Optional[str] = None,
    product_name: Optional[str] = None,
    document_name: Optional[str] = None,
    product_version: Optional[str] = None,
    document_version: Optional[str] = None,
) -> bytes:
    """
    Render a Word document (.doc, .docx) as PDF bytes.

    The document is first converted to Markdown with ``convert_word_to_md``;
    that Markdown is then handed to ``md_to_pdf_bytes_with_renderer`` with the
    ``"weasyprint"`` renderer.

    Args:
        file_path: Location of the Word file.
        toc: Whether a table of contents is requested.
        header_text: Header text, forwarded to the renderer.
        footer_text: Footer text, forwarded to the renderer.
        logo_url: Logo image URL, forwarded to the renderer.
        copyright_text: Copyright notice, forwarded to the renderer.
        filename_text: Display name; the file's stem is used when this is
            empty or ``None``.
        cover_src: Cover image source, forwarded to the renderer.
        product_name: Product name, forwarded to the renderer.
        document_name: Document name, forwarded to the renderer.
        product_version: Product version, forwarded to the renderer.
        document_version: Document version, forwarded to the renderer.

    Returns:
        The bytes returned by the renderer.
    """
    logger.info(f"Converting Word to PDF: {file_path}")

    source = Path(file_path)

    # Step 1: Word -> Markdown. The converter yields a pair; only the second
    # item (the Markdown text) is needed here.
    _first, md_text = convert_word_to_md(source)

    # Step 2: Markdown -> PDF.
    display_name = filename_text if filename_text else source.stem
    render_args = {
        "md": md_text,
        "renderer": "weasyprint",
        "toc": toc,
        "header_text": header_text,
        "footer_text": footer_text,
        "logo_url": logo_url,
        "copyright_text": copyright_text,
        "filename_text": display_name,
        "cover_src": cover_src,
        "product_name": product_name,
        "document_name": document_name,
        "product_version": product_version,
        "document_version": document_version,
    }
    rendered = md_to_pdf_bytes_with_renderer(**render_args)

    logger.info(f"Word to PDF conversion complete: {len(rendered)} bytes")
    return rendered
|
||||
|
||||
|
||||
def markdown_to_pdf_bytes(
    markdown_content: str,
    toc: bool = False,
    header_text: Optional[str] = None,
    footer_text: Optional[str] = None,
    logo_url: Optional[str] = None,
    copyright_text: Optional[str] = None,
    filename_text: Optional[str] = None,
    cover_src: Optional[str] = None,
    product_name: Optional[str] = None,
    document_name: Optional[str] = None,
    product_version: Optional[str] = None,
    document_version: Optional[str] = None,
    css_name: Optional[str] = None,
    css_text: Optional[str] = None,
) -> bytes:
    """
    Render Markdown text as PDF bytes.

    Every argument is passed straight through to
    ``md_to_pdf_bytes_with_renderer`` together with the ``"weasyprint"``
    renderer name.

    Args:
        markdown_content: The Markdown text to render.
        toc: Whether a table of contents is requested.
        header_text: Header text.
        footer_text: Footer text.
        logo_url: Logo image URL.
        copyright_text: Copyright notice.
        filename_text: Display name for the document.
        cover_src: Cover image source.
        product_name: Product name.
        document_name: Document name.
        product_version: Product version.
        document_version: Document version.
        css_name: Name of a stylesheet, forwarded to the renderer.
        css_text: Custom CSS text, forwarded to the renderer.

    Returns:
        The bytes returned by the renderer.
    """
    logger.info("Converting Markdown to PDF")

    render_args = {
        "md": markdown_content,
        "renderer": "weasyprint",
        "css_name": css_name,
        "css_text": css_text,
        "toc": toc,
        "header_text": header_text,
        "footer_text": footer_text,
        "logo_url": logo_url,
        "copyright_text": copyright_text,
        "filename_text": filename_text,
        "cover_src": cover_src,
        "product_name": product_name,
        "document_name": document_name,
        "product_version": product_version,
        "document_version": document_version,
    }
    rendered = md_to_pdf_bytes_with_renderer(**render_args)

    logger.info(f"Markdown to PDF conversion complete: {len(rendered)} bytes")
    return rendered
|
||||
|
||||
|
||||
def markdown_file_to_pdf_bytes(
    file_path: str | Path,
    encoding: str = "utf-8",
    **kwargs
) -> bytes:
    """
    Read a Markdown file and render it as PDF bytes.

    Args:
        file_path: Location of the Markdown file.
        encoding: Text encoding used to read the file (default: utf-8).
        **kwargs: Extra keyword arguments handed to ``markdown_to_pdf_bytes``.
            A missing or falsy ``filename_text`` is replaced by the file's
            stem.

    Returns:
        Whatever ``markdown_to_pdf_bytes`` returns for the file's text.
    """
    source = Path(file_path)
    text = source.read_text(encoding=encoding)

    # Fall back to the file's stem when the caller gave no usable display name.
    options = {**kwargs, "filename_text": kwargs.get("filename_text") or source.stem}

    return markdown_to_pdf_bytes(text, **options)
|
||||
|
||||
|
||||
def read_file_content(file_path: str | Path) -> Tuple[str, bytes]:
|
||||
"""
|
||||
Read file content and detect content type
|
||||
|
||||
Args:
|
||||
file_path: Path to file
|
||||
|
||||
Returns:
|
||||
Tuple of (detected_type, content_bytes)
|
||||
"""
|
||||
path = Path(file_path)
|
||||
if not path.exists():
|
||||
raise FileNotFoundError(f"File not found: {file_path}")
|
||||
|
||||
content_bytes = path.read_bytes()
|
||||
|
||||
# Detect by extension
|
||||
ext = path.suffix.lower()
|
||||
if ext in {".md", ".markdown"}:
|
||||
return "markdown", content_bytes
|
||||
elif ext in {".doc", ".docx"}:
|
||||
return "word", content_bytes
|
||||
elif ext in {".txt"}:
|
||||
return "text", content_bytes
|
||||
else:
|
||||
# Try to detect by content
|
||||
content_start = content_bytes[:8]
|
||||
if content_start.startswith(b"PK\x03\x04"):
|
||||
return "word", content_bytes
|
||||
return "text", content_bytes
|
||||
Reference in New Issue
Block a user