--- olmocr/pipeline.py.orig 2026-03-12 16:30:29 UTC +++ olmocr/pipeline.py @@ -811,7 +811,7 @@ async def vllm_server_task(model_name_or_path, args, u model_name_or_path, "--port", str(args.port), - "--disable-log-requests", + "--no-enable-log-requests", "--uvicorn-log-level", "warning", "--served-model-name", @@ -833,12 +833,15 @@ async def vllm_server_task(model_name_or_path, args, u if unknown_args: cmd.extend(unknown_args) + if getattr(args, "device", "cpu") == "cpu": + cmd.append("--enforce-eager") + proc = await asyncio.create_subprocess_exec( *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, # OMP_NUM_THREADS needs to be 1, otherwise you could have contention if you are running multiple copies of olmOCR on a machine with several GPUS - env={**os.environ, "OMP_NUM_THREADS": "1"}, + env={**os.environ, "OMP_NUM_THREADS": str(os.cpu_count() or 1) if getattr(args, "device", "cpu") == "cpu" else "1"}, ) # Ensure the subprocess is terminated on exit @@ -1211,7 +1214,7 @@ async def main(): parser.add_argument( "--model", - help="Path where the model is located, allenai/olmOCR-2-7B-1025-FP8 is the default, can be local, s3, or hugging face.", - default="allenai/olmOCR-2-7B-1025-FP8", + help="Path where the model is located, allenai/olmOCR-2-7B-1025 is the default, can be local, s3, or hugging face.", + default="allenai/olmOCR-2-7B-1025", ) # More detailed config options, usually you shouldn't have to change these @@ -1256,6 +1259,7 @@ async def main(): vllm_group.add_argument("--tensor-parallel-size", "-tp", type=int, default=1, help="Tensor parallel size for vLLM") vllm_group.add_argument("--data-parallel-size", "-dp", type=int, default=1, help="Data parallel size for vLLM") vllm_group.add_argument("--port", type=int, default=30024, help="Port to use for the VLLM server") + vllm_group.add_argument("--device", type=str, default="cpu", help="Device to use for inference (cpu, cuda, etc.)") # Beaker/job running stuff beaker_group = parser.add_argument_group("beaker/cluster execution") @@ -1421,7 +1425,7 @@ async def main(): # If you get this far, then you are doing inference 
and need a GPU # check_sglang_version() - if use_internal_server: + if use_internal_server and args.device != "cpu": check_torch_gpu_available() logger.info(f"Starting pipeline with PID {os.getpid()}")