# GLiNER_Inference_Server/gliner_inference_server/main.py

from fastapi import FastAPI, HTTPException, status
from typing import List, Optional, Union, Dict, Any, Tuple
from contextlib import asynccontextmanager
import os
import torch
from gliner import GLiNER
from .models import *

# Global model instance.
# Starts as None and is assigned by `lifespan` at application startup;
# `health_check` reports whether it has been set.
model: Optional[GLiNER] = None


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the GLiNER model on startup and release it on shutdown.

    The checkpoint name is read from the ``MODEL_NAME`` environment variable
    (default ``knowledgator/gliner-multitask-large-v0.5``). The model is moved
    to ``"cuda"`` when ``torch.cuda.is_available()`` is true, otherwise to
    ``"cpu"``, and stored in the module-level ``model`` global.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).

    Yields:
        None. The application runs while the context is suspended here.
    """
    global model
    print("Loading GLiNER model...")
    model_name = os.getenv("MODEL_NAME", "knowledgator/gliner-multitask-large-v0.5")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = GLiNER.from_pretrained(model_name).to(device)
    print(f"Model loaded on {device}")
    try:
        yield
    finally:
        # `finally` guarantees the shutdown path also runs when an exception
        # is thrown into the context manager at the `yield` point.
        print("Shutting down...")
        # Drop the global reference so the model can be garbage-collected.
        model = None
        if device == "cuda":
            # Release cached GPU memory held by PyTorch's caching allocator.
            torch.cuda.empty_cache()


# ASGI application object; startup/shutdown handling is delegated to `lifespan`.
app = FastAPI(
    title="GLiNER Inference Server",
    description="Named Entity Recognition, Relation Extraction, and Summarization API",
    version="1.0.0",
    lifespan=lifespan,
)


@app.get("/health")
async def health_check():
    """Health check endpoint.

    Returns a dict with a constant ``"status"`` of ``"healthy"`` and a
    ``"model_loaded"`` flag that is true once the module-level ``model``
    global is no longer ``None``.
    """
    is_loaded = model is not None
    payload = {"status": "healthy", "model_loaded": is_loaded}
    return payload


@app.post("/general")
async def general_extraction(request):
    """Named Entity Recognition endpoint (not implemented yet).

    Raises:
        HTTPException: Always, with status 501, until the handler is written.
    """
    # NOTE(review): `request` has no type annotation, so FastAPI will most
    # likely read it from the query string rather than the JSON body.
    # Presumably it should be typed with a request schema from `.models` —
    # TODO confirm which one.
    #
    # Fail explicitly instead of answering 200 with a `null` body, which would
    # look like a successful extraction to clients.
    raise HTTPException(
        status_code=status.HTTP_501_NOT_IMPLEMENTED,
        detail="The /general endpoint is not implemented yet.",
    )


@app.post("/relation-extraction")
async def relation_extraction(request):
    """Relation Extraction endpoint (not implemented yet).

    Raises:
        HTTPException: Always, with status 501, until the handler is written.
    """
    # NOTE(review): `request` has no type annotation, so FastAPI will most
    # likely read it from the query string rather than the JSON body.
    # Presumably it should be typed with a request schema from `.models` —
    # TODO confirm which one.
    #
    # Fail explicitly instead of answering 200 with a `null` body, which would
    # look like a successful extraction to clients.
    raise HTTPException(
        status_code=status.HTTP_501_NOT_IMPLEMENTED,
        detail="The /relation-extraction endpoint is not implemented yet.",
    )


@app.post("/summarization")
async def summarization(request):
    """Summarization endpoint (not implemented yet).

    Raises:
        HTTPException: Always, with status 501, until the handler is written.
    """
    # NOTE(review): `request` has no type annotation, so FastAPI will most
    # likely read it from the query string rather than the JSON body.
    # Presumably it should be typed with a request schema from `.models` —
    # TODO confirm which one.
    #
    # Fail explicitly instead of answering 200 with a `null` body, which would
    # look like a successful summarization to clients.
    raise HTTPException(
        status_code=status.HTTP_501_NOT_IMPLEMENTED,
        detail="The /summarization endpoint is not implemented yet.",
    )

if __name__ == "__main__":
    # Direct-execution entry point: serve `app` with uvicorn on all
    # interfaces, using the PORT environment variable (default 8000).
    from uvicorn import run as serve

    listen_port = int(os.environ.get("PORT", "8000"))
    serve(app, host="0.0.0.0", port=listen_port)