Python API

安装

pip install anyparse-python

# 或者

pip install -e .

同步 API

from anyparse import AnyParser

model = AnyParser(config="config/config.yaml")
res = model.invoke(file = "/path/to/your_file")

异步 API

from anyparse import AsyncAnyParser

model = AsyncAnyParser(config="config/config.yaml")
# 注意：await 需在 async 函数内（或支持顶层 await 的环境，如 Jupyter）使用
res = await model.ainvoke(file = "/path/to/your_file")

config.yaml

product_name: "anyparse"

model_id: "${product_name}"

modelapi:
    prefix: "/${product_name}"
    host: ${oc.env:api_host, "0.0.0.0"}
    port: ${oc.env:api_port, "18007"}
    path:
        # restful api
        filetypes: "/filetypes/v1"
        invoke: "/invoke/v1"
        # openai proxy
        openai_model_list: "/openai/v1/models"
        openai_create_file: "/openai/v1/files"
        openai_retrieve_file: "/openai/v1/files"
        openai_content_file: "/openai/v1/files"
        openai_delete_file: "/openai/v1/files"
        openai_chat_completions: "/openai/v1/chat/completions"
        openai_responses: "/openai/v1/responses"

auths:
    # api key
    secret_key: ${oc.env:api_secret_key, "sk_6c5...f7bf"}

filetypes: 
    text: ['txt']
    html: ['htm','html','xhtml','shtml']
    markdown: ['md','rst']
    image: ['png', 'jpeg', 'jpg']
    pdf: ['pdf']
    epub: ['epub']
    office: ['docx','pptx','xlsx']
    csv: ['tsv','csv']
    ipynb: ['ipynb']
    email: ['eml']

mimetypes:
    txt: "data:text/plain;base64,"
    htm: "data:text/html;base64,"
    html: "data:text/html;base64,"
    xhtml: "data:text/html;base64,"
    shtml: "data:text/html;base64,"
    md: "data:text/markdown;base64,"
    rst: "data:text/rst;base64,"
    png: "data:image/png;base64,"
    jpeg: "data:image/jpeg;base64,"
    jpg: "data:image/jpeg;base64,"
    webp: "data:image/webp;base64,"
    pdf: "data:application/pdf;base64,"
    epub: "data:application/epub;base64,"
    docx: "data:application/docx;base64,"
    pptx: "data:application/pptx;base64,"
    xlsx: "data:application/xlsx;base64,"
    tsv: "data:text/csv;base64,"
    csv: "data:text/csv;base64,"
    ipynb: "data:application/ipynb;base64,"
    eml: "data:message/eml;base64,"
    wav: "data:audio/wav;base64,"
    mp3: "data:audio/mpeg;base64,"
    aac: "data:audio/aac;base64,"
    flac: "data:audio/flac;base64,"
    mp4: "data:video/mp4;base64,"
    mov: "data:video/mov;base64,"
    mkv: "data:video/mkv;base64,"
    webm: "data:video/webm;base64,"
    avi: "data:video/avi;base64,"

anyparse:
    # 缓存文件保存目录
    cache_dir: "~/.cache/anyparse"
    # 计算文件 MD5
    autocal_md5: false
    # 检测编码
    autodetect_encoding: true
    # 图片和 PDF
    ## 使用文档方向分类
    use_doc_cls: false
    ## 使用文档矫正
    use_doc_rectifier: false
    ## 使用文档布局
    use_doc_layout: true
    ## 文档布局识别的最小图片尺寸
    doc_layout_image_min_size: 500
    ## 使用图片缩放
    use_image_resize: false
    ## 图片批处理大小
    image_batch_size: 1
    ## OCR 批处理大小
    ocr_batch_size: 1

    # 文本编码
    text_encoding: "utf-8"
    # csv tsv 分块大小
    table_chunk_size: null
    # PDF 页面转图片的 DPI
    dpi: 200
    # 解析详细输出
    verbose: true
    # 解析流式处理
    stream: false

    ### docx
    docx_extract_headers_footers: true
    docx_extract_images: true

    ### pptx
    pptx_extract_images: true

    ### xlsx
    excel_max_rows: null  # 未使用
    excel_extract_images: true

    ### 文档方向分类，请参考 paddleocr pp_lcnet_x1_0_doc_ori
    doc_cls:
        batch_size: 1
        model_path: "~/.cache/anyparse/models/pp_lcnet_x1_0_doc_ori"
        dtype: "auto"
        device_map: "auto"

    ### 文档矫正，请参考 paddleocr uvdoc
    doc_rectifier:
        batch_size: 1
        model_path: "~/.cache/anyparse/models/pp_uvdoc"
        dtype: "auto"
        device_map: "auto"

    ### 文档布局，请参考 paddleocr ppdoclayout-v3
    layout:
        model_path: "~/.cache/anyparse/models/ppdoclayout-v3"
        threshold: 0.3
        batch_size: 1
        dtype: "auto"
        device_map: "auto"
        layout_nms: true

    vlm:
        model_type: "paddleocrvl" # paddleocr, glmocr_v1, paddleocrvl, vllm

        paddleocr:
            model_class: "PaddleOCRClient.PPOCRV6"
            model_path: "~/.cache/anyparse/models/paddleocrv6-small"
            batch_size: 6
            dtype: 'auto'
            device_map: 'auto'

        glmocr_v1:
            model_class: "GlmOCRClient.GLMOCRV1"
            model_path: "~/.cache/anyparse/models/glmocr-v1"
            batch_size: 1
            max_new_tokens: 8192
            dtype: "auto"
            device_map: "auto"
            min_pixels: 12544 # 112 * 112
            max_pixels: 71372800 # 注意：14 * 14 * 4 * 1280 = 1003520，与此值不符，请核对

        paddleocrvl:
            model_class: "PaddleOCRVLClient.PPOCRVLClient"
            model_path: "~/.cache/anyparse/models/paddleocrvl-v1.6"
            batch_size: 1
            max_new_tokens: 16384
            dtype: "auto"
            device_map: "auto"  
            attn_implementation: null  
            truncate_content: true
            truncate_content_list: [5000, 50] 

        vllm: # paddleocrvl-1.6 openai server
            model_class: "OpenAIClient.OpenAIClient"
            base_url: "http://localhost:18003/v1"
            api_key: "sk-123456"
            model: "PaddleOCR-VL-1.6"
            stream: false
            timeout: 1800.0
            max_retries: 2
            batch_size: 8
            max_new_tokens: 8192
            # paddleocrvl 提示词映射
            task_prompt_map:
                abstract: "OCR:"
                algorithm: "OCR:"
                content: "OCR:"
                doc_title: "OCR:"
                figure_title: "OCR:"
                paragraph_title: "OCR:"
                reference_content: "OCR:"
                text: "OCR:"
                vertical_text: "OCR:"
                vision_footnote: "OCR:"
                seal: "OCR:"
                formula_number: "OCR:"
                header: "OCR:"
                footer: "OCR:"
                number: "OCR:"
                footnote: "OCR:"
                aside_text: "OCR:"
                reference: "OCR:"
                footer_image: "OCR:"
                header_image: "OCR:"
                image: "OCR:"
                table: "Table Recognition:"
                display_formula: "Formula Recognition:"
                inline_formula: "Formula Recognition:"
                chart: "Chart Recognition:"
            client_args:
            call_args:
            prompt_template: >-
                [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url", 
                                "image_url": {
                                    "url": "{{ data_url }}"
                                }
                            },
                            {
                                "type": "text", 
                                "text": "{{ prompt }}"
                            }
                        ]
                    }
                ]

    # converters_models:
    #     - name: "markdown"
    #       model_class: "mkd.MkdConverter"

logger: # loguru 配置
  type: "loguru"
  filename: ./logs/${product_name}.log
  level: "DEBUG"
  loguru:
      encoding: "utf-8"
      mode: "a+"
      rotation: "00:00"
      retention: "30 days"
      colorize: false
      enqueue: true
      backtrace: true
      diagnose: true
      compression: null
      strformat: "<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | <level>{level: <8}</level> | <cyan>{process}</cyan>:<cyan>{thread}</cyan> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> | <level>{message}</level>"

注意事项

${oc.env:api_host, "0.0.0.0"} 采用类似 omegaconf 的环境变量插值语法：优先读取环境变量 api_host，未设置时使用默认值 "0.0.0.0"。可以通过环境变量修改，例如：export api_host=0.0.0.0，或者 source config/config.env。

invoke 和 ainvoke 参数

invoke 和 ainvoke 支持在调用时传入运行时参数，例如：

model.invoke(
    file = "/path/to/your_file",
    autocal_md5 = True,
    ...
)

autocal_md5: 是否自动计算 MD5。默认值：True（上文示例 config.yaml 中设置为 false）。
autodetect_encoding: 是否自动检测编码。默认值：True。
use_doc_cls: 是否执行文档方向分类。默认值：False。
use_doc_rectifier: 是否执行文档矫正。默认值：False。
use_doc_layout: 是否执行文档布局识别。默认值：True。
doc_layout_image_min_size: 文档布局识别的最小图片尺寸。默认值：400（上文示例 config.yaml 中设置为 500）。
use_image_resize: 是否缩放图片。默认值：False。
text_encoding: 文本编码格式。默认值："utf-8"。
table_chunk_size: 表格分块大小。默认值：None。
table_custom_separator: 表格自定义分隔符。默认值：None。
table_sniffer: 是否自动检测表格分隔符。默认值：True。
dpi: 图片分辨率（DPI）。默认值：200。
verbose: 是否启用详细输出。默认值：True。
stream: 是否使用流式处理。默认值：False。
docx_extract_headers_footers: 是否从 DOCX 文件中提取页眉和页脚。默认值：True。
docx_extract_images: 是否从 DOCX 文件中提取图片。默认值：True。
pptx_extract_images: 是否从 PPTX 文件中提取图片。默认值：True。
excel_extract_images: 是否从 Excel 文件中提取图片。默认值：True。
excel_max_rows: 从 Excel 文件中提取的最大行数。默认值：None。
max_new_tokens: 最大生成长度。默认值：8192。
image_batch_size: 图片处理批大小。默认值：1。
ocr_batch_size: 文本（OCR）处理批大小。默认值：1。