Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Build-context exclusions for `docker build`.
# NOTE: unlike .gitignore, .dockerignore patterns are anchored at the context
# root, so artifacts that can appear in nested directories use a `**/` prefix.

# Version control and CI metadata
.git/
.github/

# Virtual environments and local caches
.venv/
venv/
env/
ENV/
.cache/

# Python bytecode, logs, and tool caches (any depth)
**/__pycache__/
**/*.py[cod]
**/*.log
**/.pytest_cache/
**/.mypy_cache/
**/.ruff_cache/
**/.ipynb_checkpoints/

# Runtime outputs and large assets that the image never copies
outputs/
log/
assets/
**/*.pdf

# Build and packaging artifacts
build/
dist/
**/*.egg-info/
.eggs/

# OS and editor files (any depth for OS droppings)
**/.DS_Store
**/Thumbs.db
.vscode/
.idea/
121 changes: 121 additions & 0 deletions .github/workflows/docker.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
name: Docker

# Triggers:
#   - pushes to main or to v*.*.* tags,
#   - pull requests,
#   - manual dispatch.
# Both push and pull_request list the same Docker-related path filters.
on:
  push:
    branches:
      - main
    tags:
      - 'v*.*.*'
    paths:
      - '.github/workflows/docker.yml'
      - '.dockerignore'
      - 'Dockerfile'
      - 'docker-compose.yml'
      - 'infer.py'
      - 'requirements-sglang.txt'
      - 'wheel/**'
  pull_request:
    paths:
      - '.github/workflows/docker.yml'
      - '.dockerignore'
      - 'Dockerfile'
      - 'docker-compose.yml'
      - 'infer.py'
      - 'requirements-sglang.txt'
      - 'wheel/**'
  workflow_dispatch:

# The job reads the repository and writes packages (needed to push to GHCR).
permissions:
  contents: read
  packages: write

# One run per ref: a newer run on the same ref cancels the one in progress.
concurrency:
  group: docker-${{ github.ref }}
  cancel-in-progress: true

jobs:
  # Builds the image once for GHCR (pushed on every non-PR event) and, when
  # both Docker Hub secrets are configured, a second time for Docker Hub.
  build:
    name: Build and publish Docker image
    runs-on: ubuntu-latest
    env:
      # Exposed at job level so step-level `if:` conditions can test for them.
      DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}
      DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
    steps:
      - name: Checkout
        uses: actions/checkout@v5

      # Emits `ghcr` (lower-cased via bash `,,`) and, only when a Docker Hub
      # username is set, `dockerhub` as step outputs.
      - name: Set image names
        id: images
        shell: bash
        run: |
          echo "ghcr=ghcr.io/${GITHUB_REPOSITORY,,}" >> "${GITHUB_OUTPUT}"
          if [ -n "${DOCKERHUB_USERNAME}" ]; then
            echo "dockerhub=${DOCKERHUB_USERNAME}/unlimited-ocr" >> "${GITHUB_OUTPUT}"
          fi

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      # Pull requests only build; they never log in or push.
      - name: Log in to GitHub Container Registry
        if: github.event_name != 'pull_request'
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract GHCR metadata
        id: meta-ghcr
        uses: docker/metadata-action@v5
        with:
          images: ${{ steps.images.outputs.ghcr }}
          tags: |
            type=ref,event=branch
            type=ref,event=pr
            type=ref,event=tag
            type=sha,prefix=sha-
            type=raw,value=latest,enable={{is_default_branch}}

      - name: Build and optionally push GHCR image
        uses: docker/build-push-action@v6
        with:
          context: .
          file: ./Dockerfile
          platforms: linux/amd64
          # Push on everything except pull requests.
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.meta-ghcr.outputs.tags }}
          labels: ${{ steps.meta-ghcr.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max

      # The three Docker Hub steps below share one guard: not a pull request
      # and both Docker Hub secrets are non-empty.
      - name: Log in to Docker Hub
        if: github.event_name != 'pull_request' && env.DOCKERHUB_USERNAME != '' && env.DOCKERHUB_TOKEN != ''
        uses: docker/login-action@v3
        with:
          username: ${{ env.DOCKERHUB_USERNAME }}
          password: ${{ env.DOCKERHUB_TOKEN }}

      - name: Extract Docker Hub metadata
        if: github.event_name != 'pull_request' && env.DOCKERHUB_USERNAME != '' && env.DOCKERHUB_TOKEN != ''
        id: meta-dockerhub
        uses: docker/metadata-action@v5
        with:
          images: ${{ steps.images.outputs.dockerhub }}
          tags: |
            type=ref,event=branch
            type=ref,event=tag
            type=sha,prefix=sha-
            type=raw,value=latest,enable={{is_default_branch}}

      - name: Build and push Docker Hub image
        if: github.event_name != 'pull_request' && env.DOCKERHUB_USERNAME != '' && env.DOCKERHUB_TOKEN != ''
        uses: docker/build-push-action@v6
        with:
          context: .
          file: ./Dockerfile
          platforms: linux/amd64
          push: true
          tags: ${{ steps.meta-dockerhub.outputs.tags }}
          labels: ${{ steps.meta-dockerhub.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ __pycache__/
venv/
env/
ENV/
.cache/

# Build and packaging
build/
Expand Down
88 changes: 88 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# syntax=docker/dockerfile:1.7

# Base images for the two stages; both can be overridden with --build-arg.
ARG CUDA_IMAGE=nvidia/cuda:12.9.1-cudnn-devel-ubuntu24.04
ARG CUDA_RUNTIME_IMAGE=nvidia/cuda:12.9.1-cudnn-runtime-ubuntu24.04

# ---- build stage: produce a wheelhouse for everything in requirements-sglang.txt
FROM ${CUDA_IMAGE} AS build

ARG DEBIAN_FRONTEND=noninteractive

# Put the virtualenv first on PATH so plain `python` resolves to /opt/venv.
ENV PATH=/opt/venv/bin:$PATH \
    PIP_NO_CACHE_DIR=1 \
    PYTHONUNBUFFERED=1

# System packages for the build; the apt lists are removed in the same layer.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        git \
        libgl1 \
        libglib2.0-0 \
        python3.12 \
        python3.12-dev \
        python3.12-venv && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /app

# The requirements file references the wheel by relative path (./wheel/...),
# so both are copied into /app before pip runs.
COPY requirements-sglang.txt ./
COPY wheel/ ./wheel/

# Create the venv, refresh the packaging tools, and build wheels for every
# requirement into /wheelhouse for the runtime stage to install from.
RUN python3.12 -m venv /opt/venv && \
    python -m pip install --upgrade pip setuptools wheel && \
    mkdir /wheelhouse && \
    python -m pip wheel --wheel-dir /wheelhouse -r requirements-sglang.txt

# ---- runtime stage: slim CUDA runtime image that serves the SGLang API
FROM ${CUDA_RUNTIME_IMAGE} AS runtime

ARG DEBIAN_FRONTEND=noninteractive
# UID/GID of the unprivileged `unlimited` user; override with --build-arg to
# match the owner of bind-mounted host directories.
ARG USER_ID=1000
ARG GROUP_ID=1000

# Put the virtualenv first on PATH so plain `python` resolves to /opt/venv.
ENV HF_HOME=/home/unlimited/.cache/huggingface \
    PATH=/opt/venv/bin:$PATH \
    PYTHONUNBUFFERED=1

RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        git \
        libgl1 \
        libglib2.0-0 \
        python3.12 \
        python3.12-venv \
    && rm -rf /var/lib/apt/lists/*

# Install from the build stage's wheelhouse through a bind mount instead of
# COPY + `rm -rf`: a copied wheelhouse stays in its own image layer even after
# being deleted in a later layer, so the final image would still carry it.
# --no-cache-dir keeps pip's download cache out of this layer as well.
RUN --mount=type=bind,from=build,source=/wheelhouse,target=/wheelhouse \
    python3.12 -m venv /opt/venv \
    && python -m pip install --no-cache-dir --upgrade pip setuptools wheel \
    && python -m pip install --no-cache-dir --no-index --find-links=/wheelhouse \
        "sglang==0.0.0.dev11416+g92e8bb79e" \
        "pymupdf==1.27.2.2"

WORKDIR /app

COPY infer.py README.md LICENSE ./

# Ubuntu 24.04 base images ship a default `ubuntu` account on UID/GID 1000,
# which collides with the default USER_ID/GROUP_ID above; drop it if present
# before creating the service user. Then pre-create the writable directories
# and hand them to that user.
RUN if id -u ubuntu >/dev/null 2>&1; then userdel --remove ubuntu; fi \
    && if getent group ubuntu >/dev/null; then groupdel ubuntu; fi \
    && groupadd --gid "${GROUP_ID}" unlimited \
    && useradd --uid "${USER_ID}" --gid "${GROUP_ID}" --create-home --shell /bin/bash unlimited \
    && mkdir -p /app/log /app/outputs "${HF_HOME}" \
    && chown -R unlimited:unlimited /app /home/unlimited

USER unlimited

EXPOSE 10000
VOLUME ["/data", "/app/outputs", "/app/log", "/home/unlimited/.cache/huggingface"]

# Default command: launch the SGLang server on port 10000. Override the
# command (e.g. `python infer.py ...`) for batch inference.
CMD ["python", "-m", "sglang.launch_server", \
     "--model", "baidu/Unlimited-OCR", \
     "--trust-remote-code", \
     "--served-model-name", "Unlimited-OCR", \
     "--attention-backend", "fa3", \
     "--page-size", "1", \
     "--mem-fraction-static", "0.8", \
     "--context-length", "32768", \
     "--enable-custom-logit-processor", \
     "--disable-overlap-schedule", \
     "--skip-server-warmup", \
     "--host", "0.0.0.0", \
     "--port", "10000"]
73 changes: 70 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -127,20 +127,20 @@ model.infer_multi(
### SGLang

Set up the environment (uv-managed virtualenv). Install the local SGLang wheel first,
then pin `kernels==0.9.0` and install PyMuPDF for PDF-to-image conversion:
then install PyMuPDF for PDF-to-image conversion:
```shell
uv venv --python 3.12
source .venv/bin/activate

uv pip install wheel/sglang-0.0.0.dev11416+g92e8bb79e-py3-none-any.whl
uv pip install kernels==0.11.7
uv pip install pymupdf==1.27.2.2
```

Start the SGLang server:
```shell
python -m sglang.launch_server \
--model baidu/Unlimited-OCR \
--trust-remote-code \
--served-model-name Unlimited-OCR \
--attention-backend fa3 \
--page-size 1 \
Expand Down Expand Up @@ -258,7 +258,7 @@ python infer.py \
--pdf ./examples/document.pdf \
--output_dir ./outputs \
--concurrency 8 \
--image_mode gundam
--image_mode base
```

Useful options:
Expand All @@ -268,6 +268,73 @@ Useful options:
--server_log ./log/sglang_server.log
```

### Docker

The Docker image starts the OpenAI-compatible SGLang API server by default.
It requires an NVIDIA GPU, a compatible host driver, and the NVIDIA Container Toolkit.

Build the image locally:
```shell
docker build -t unlimited-ocr:local .
```

Start the API server:
```shell
docker run --rm --gpus all --shm-size 16g \
-p 10000:10000 \
-v "$HOME/.cache/huggingface:/home/unlimited/.cache/huggingface" \
unlimited-ocr:local
```

The API is available at `http://127.0.0.1:10000` and accepts the same
OpenAI-compatible requests shown in the SGLang example above.

By default, SGLang uses one GPU. For multi-GPU inference, expose the GPUs to
Docker and add `--tensor-parallel-size N` to the server command.

Alternatively, adjust the volume paths in `docker-compose.yml` to match your local directories, then run:
```shell
docker compose up
```

For batch inference, override the default command with `python infer.py`.

Run OCR for an image directory:
```shell
docker run --rm --gpus all --shm-size 16g \
-v "$PWD/examples/images:/data/images:ro" \
-v "$PWD/outputs:/app/outputs" \
-v "$PWD/log:/app/log" \
-v "$HOME/.cache/huggingface:/home/unlimited/.cache/huggingface" \
unlimited-ocr:local \
python infer.py \
--image_dir /data/images \
--output_dir /app/outputs \
--concurrency 8 \
--image_mode gundam
```

Run OCR for a PDF:
```shell
docker run --rm --gpus all --shm-size 16g \
-v "$PWD/examples/document.pdf:/data/document.pdf:ro" \
-v "$PWD/outputs:/app/outputs" \
-v "$PWD/log:/app/log" \
-v "$HOME/.cache/huggingface:/home/unlimited/.cache/huggingface" \
unlimited-ocr:local \
python infer.py \
--pdf /data/document.pdf \
--output_dir /app/outputs \
--concurrency 8 \
--image_mode base
```

The included GitHub Actions workflow builds the image (without publishing it) for pull requests,
and publishes images on pushes to `main`, on version tags (`v*.*.*`), and on manual runs:

- GitHub Container Registry: publishes to `ghcr.io/<owner>/<repo>` using the built-in `GITHUB_TOKEN`.
- Docker Hub: set the repository secrets `DOCKERHUB_USERNAME` and `DOCKERHUB_TOKEN` to also publish to `<DOCKERHUB_USERNAME>/unlimited-ocr`; if either secret is missing, the Docker Hub steps are skipped.


## Visualization

Expand Down
24 changes: 24 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
services:
  # Builds the image from the local Dockerfile and runs its default command
  # with all NVIDIA GPUs reserved.
  unlimited-ocr:
    build:
      context: .
    image: 'unlimited-ocr:local'
    shm_size: '16gb'
    ports:
      - '10000:10000'
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      NVIDIA_VISIBLE_DEVICES: 'all'
      NVIDIA_DRIVER_CAPABILITIES: 'compute,utility'
    # Host paths are relative to this file; edit them for your machine.
    volumes:
      - './data:/data:ro'
      - './outputs:/app/outputs'
      - './log:/app/log'
      - './.cache/huggingface:/home/unlimited/.cache/huggingface'
2 changes: 2 additions & 0 deletions requirements-sglang.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# SGLang, installed from the wheel vendored under ./wheel/. The path is
# relative, so run pip from the directory that contains `wheel/` (the
# Dockerfile build stage copies both into /app).
./wheel/sglang-0.0.0.dev11416+g92e8bb79e-py3-none-any.whl
# PyMuPDF, pinned; the README uses it for PDF-to-image conversion.
pymupdf==1.27.2.2