Python API
Installation
pip install anyparse-python
# or
pip install -e .
Sync API
from anyparse import AnyParser
model = AnyParser(config="config/config.yaml")
res = model.invoke(file = "/path/to/your_file")
Async API
from anyparse import AsyncAnyParser
model = AsyncAnyParser(config="config/config.yaml")
res = await model.ainvoke(file = "/path/to/your_file")
config.yaml
product_name: "anyparse"
model_id: "${product_name}"
modelapi:
prefix: "/${product_name}"
host: ${oc.env:api_host, "0.0.0.0"}
port: ${oc.env:api_port, "18007"}
path:
# restful api
filetypes: "/filetypes/v1"
invoke: "/invoke/v1"
# openai proxy
openai_model_list: "/openai/v1/models"
openai_create_file: "/openai/v1/files"
openai_retrieve_file: "/openai/v1/files"
openai_content_file: "/openai/v1/files"
openai_delete_file: "/openai/v1/files"
openai_chat_completions: "/openai/v1/chat/completions"
openai_responses: "/openai/v1/responses"
auths:
# api key
secret_key: ${oc.env:api_secret_key, "sk_6c5aa04e523de79518620095a0e2f7bf"}
filetypes:
text: ['txt']
html: ['htm','html','xhtml','shtml']
markdown: ['md','rst']
image: ['png', 'jpeg', 'jpg']
pdf: ['pdf']
epub: ['epub']
office: ['docx','pptx','xlsx']
csv: ['tsv','csv']
ipynb: ['ipynb']
email: ['eml']
mimetypes:
txt: "data:text/plain;base64,"
htm: "data:text/html;base64,"
html: "data:text/html;base64,"
xhtml: "data:text/html;base64,"
shtml: "data:text/html;base64,"
md: "data:text/markdown;base64,"
rst: "data:text/rst;base64,"
png: "data:image/png;base64,"
jpeg: "data:image/jpeg;base64,"
jpg: "data:image/jpeg;base64,"
webp: "data:image/webp;base64,"
pdf: "data:application/pdf;base64,"
epub: "data:application/epub;base64,"
docx: "data:application/docx;base64,"
pptx: "data:application/pptx;base64,"
xlsx: "data:application/xlsx;base64,"
tsv: "data:text/csv;base64,"
csv: "data:text/csv;base64,"
ipynb: "data:application/ipynb;base64,"
eml: "data:message/eml;base64,"
wav: "data:audio/wav;base64,"
mp3: "data:audio/mpeg;base64,"
aac: "data:audio/aac;base64,"
flac: "data:audio/flac;base64,"
mp4: "data:video/mp4;base64,"
mov: "data:video/mov;base64,"
mkv: "data:video/mkv;base64,"
webm: "data:video/webm;base64,"
avi: "data:video/avi;base64,"
anyparse:
# cache to save file
cache_dir: "~/.cache/anyparse"
# cal file md5
autocal_md5: false
# detect encoding
autodetect_encoding: true
# image and pdf
## use doc ori cls
use_doc_cls: false
## use doc rectifier
use_doc_rectifier: false
## use doc layout
use_doc_layout: true
## doc layout min size
doc_layout_image_min_size: 500
## use image resize
use_image_resize: false
## image batch size
image_batch_size: 1
## ocr batch size
ocr_batch_size: 1
# text encoding
text_encoding: "utf-8"
# csv tsv chunk size
table_chunk_size: null
# pdf page to image dpi
dpi: 200
# parse verbose
verbose: true
# parse stream
stream: false
### docx
docx_extract_headers_footers: true
docx_extract_images: true
### pptx
pptx_extract_images: true
### xlsx
excel_max_rows: null # unused
excel_extract_images: true
### doc ori cls see paddleocr pp_lcnet_x1_0_doc_ori
doc_cls:
batch_size: 1
model_path: "~/.cache/anyparse/models/pp_lcnet_x1_0_doc_ori"
dtype: "auto"
device_map: "auto"
## uvdoc see paddleocr uvdoc
doc_rectifier:
batch_size: 1
model_path: "~/.cache/anyparse/models/pp_uvdoc"
dtype: "auto"
device_map: "auto"
### doc layout see paddleocr ppdoclayout-v3
layout:
model_path: "~/.cache/anyparse/models/ppdoclayout-v3"
threshold: 0.3
batch_size: 1
dtype: "auto"
device_map: "auto"
layout_nms: true
vlm:
model_type: "paddleocrvl" # paddleocr, glmocr_v1, paddleocrvl, vllm
paddleocr:
model_class: "PaddleOCRClient.PPOCRV6"
model_path: "~/.cache/anyparse/models/paddleocrv6-small"
batch_size: 6
dtype: 'auto'
device_map: 'auto'
glmocr_v1:
model_class: "GlmOCRClient.GLMOCRV1"
model_path: "~/.cache/anyparse/models/glmocr-v1"
batch_size: 1
max_new_tokens: 8192
dtype: "auto"
device_map: "auto"
min_pixels: 12544 # 112 * 112
max_pixels: 71372800 # 14 * 14 * 4 * 1280
paddleocrvl:
model_class: "PaddleOCRVLClient.PPOCRVLClient"
model_path: "~/.cache/anyparse/models/paddleocrvl-v1.6"
batch_size: 1
max_new_tokens: 16384
dtype: "auto"
device_map: "auto"
attn_implementation: null
truncate_content: true
truncate_content_list: [5000, 50]
vllm: # paddleocrvl-1.6 openai server
model_class: "OpenAIClient.OpenAIClient"
base_url: "http://localhost:18003/v1"
api_key: "sk-123456"
model: "PaddleOCR-VL-1.6"
stream: false
timeout: 1800.0
max_retries: 2
batch_size: 8
max_new_tokens: 8192
# paddleocrvl prompt mapping
task_prompt_map:
abstract: "OCR:"
algorithm: "OCR:"
content: "OCR:"
doc_title: "OCR:"
figure_title: "OCR:"
paragraph_title: "OCR:"
reference_content: "OCR:"
text: "OCR:"
vertical_text: "OCR:"
vision_footnote: "OCR:"
seal: "OCR:"
formula_number: "OCR:"
header: "OCR:"
footer: "OCR:"
number: "OCR:"
footnote: "OCR:"
aside_text: "OCR:"
reference: "OCR:"
footer_image: "OCR:"
header_image: "OCR:"
image: "OCR:"
table: "Table Recognition:"
display_formula: "Formula Recognition:"
inline_formula: "Formula Recognition:"
chart: "Chart Recognition:"
client_args:
call_args:
prompt_template: >-
[
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": "{{ data_url }}"
}
},
{
"type": "text",
"text": "{{ prompt }}"
}
]
}
]
# converters_models:
# - name: "markdown"
# model_class: "mkd.MkdConverter"
logger: # loguru config
type: "loguru"
filename: ./logs/${product_name}.log
level: "DEBUG"
loguru:
encoding: "utf-8"
mode: "a+"
rotation: "00:00"
retention: "30 days"
colorize: false
enqueue: true
backtrace: true
diagnose: true
compression: null
strformat: "<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | <level>{level: <8}</level> | <cyan>{process}</cyan>:<cyan>{thread}</cyan> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> | <level>{message}</level>"
notice
Values written like ${oc.env:api_host, "0.0.0.0"} use OmegaConf's oc.env resolver, so they can be overridden with environment variables: export api_host=0.0.0.0 or source config/config.env
invoke and ainvoke params
Per-call arguments can be passed to invoke and ainvoke, for example:
model.invoke(
file = "/path/to/your_file",
autocal_md5 = True,
...
)
- autocal_md5: Whether to automatically calculate MD5. Default: True.
- autodetect_encoding: Whether to automatically detect encoding. Default: True.
- use_doc_cls: Whether to perform document orientation correction. Default: False.
- use_doc_rectifier: Whether to perform document rectification. Default: False.
- use_doc_layout: Whether to perform document layout recognition. Default: True.
- doc_layout_image_min_size: Minimum image size for document layout recognition. Default: 400.
- use_image_resize: Whether to resize images. Default: False.
- text_encoding: Text encoding format. Default: "utf-8".
- table_chunk_size: Table chunk size. Default: None.
- table_custom_separator: Custom separator for tables. Default: None.
- table_sniffer: Whether to automatically detect table separators. Default: True.
- dpi: Image resolution in DPI. Default: 200.
- verbose: Whether to enable detailed output. Default: True.
- stream: Whether to use stream processing. Default: False.
- docx_extract_headers_footers: Whether to extract headers and footers from DOCX files. Default: True.
- docx_extract_images: Whether to extract images from DOCX files. Default: True.
- pptx_extract_images: Whether to extract images from PPTX files. Default: True.
- excel_extract_images: Whether to extract images from Excel files. Default: True.
- excel_max_rows: Maximum number of rows to extract from Excel files. Default: None.
- max_new_tokens: Maximum generation length. Default: 8192.
- image_batch_size: Batch size for image processing. Default: 1.
- ocr_batch_size: Batch size for text (OCR) processing. Default: 1.