Auto-commit: OCR workflow improvements, performance optimizations, and bug fixes

This commit is contained in:
2026-01-11 18:21:16 +08:00
parent 642dd0ea5f
commit 1ddd49f913
97 changed files with 5909 additions and 451 deletions

View File

@@ -51,6 +51,7 @@ from lightrag.api.routers.query_routes import create_query_routes
from lightrag.api.routers.graph_routes import create_graph_routes
from lightrag.api.routers.search_routes import create_search_routes
from lightrag.api.routers.ollama_api import OllamaAPI
from lightrag.api.routers.workspace_routes import router as workspace_router
from lightrag.utils import logger, set_verbose_debug
from lightrag.kg.shared_storage import (
@@ -196,8 +197,9 @@ def create_app(args):
# Check if API key is provided either through env var or args
api_key = os.getenv("LIGHTRAG_API_KEY") or args.key
# Initialize document manager with workspace support for data isolation
doc_manager = DocumentManager(args.input_dir, workspace=args.workspace)
# Create workspace manager for dynamic workspace management
from lightrag.api.workspace_manager import WorkspaceManager
workspace_manager = WorkspaceManager(args)
@asynccontextmanager
async def lifespan(app: FastAPI):
@@ -206,13 +208,21 @@ def create_app(args):
app.state.background_tasks = set()
try:
# Initialize database connections
await rag.initialize_storages()
# Initialize default workspace if specified
if args.workspace:
# Ensure default workspace exists
if not workspace_manager.workspace_exists(args.workspace):
workspace_manager.create_workspace(args.workspace)
# Get default workspace RAG instance and initialize it
default_rag = workspace_manager.get_rag(args.workspace)
await default_rag.initialize_storages()
# Data migration for default workspace
await default_rag.check_and_migrate_data()
await initialize_pipeline_status()
# Data migration regardless of storage implementation
await rag.check_and_migrate_data()
pipeline_status = await get_namespace_data("pipeline_status")
should_start_autoscan = False
@@ -224,20 +234,27 @@ def create_app(args):
should_start_autoscan = True
# Only run auto scan when no other process started it first
if should_start_autoscan:
if should_start_autoscan and args.workspace:
# Get document manager for default workspace
default_doc_manager = workspace_manager.get_document_manager(args.workspace)
default_rag = workspace_manager.get_rag(args.workspace)
# Create background task
task = asyncio.create_task(run_scanning_process(rag, doc_manager))
task = asyncio.create_task(run_scanning_process(default_rag, default_doc_manager))
app.state.background_tasks.add(task)
task.add_done_callback(app.state.background_tasks.discard)
logger.info(f"Process {os.getpid()} auto scan task started at startup.")
logger.info(f"Process {os.getpid()} auto scan task started at startup for workspace '{args.workspace}'.")
ASCIIColors.green("\nServer is ready to accept connections! 🚀\n")
yield
finally:
# Clean up database connections
await rag.finalize_storages()
# Clean up all workspace RAG instances
for workspace_name, rag_instance in workspace_manager._rag_instances.items():
try:
await rag_instance.finalize_storages()
except Exception as e:
logger.error(f"Error finalizing storages for workspace '{workspace_name}': {e}")
# Clean up shared data
finalize_share_data()
@@ -580,62 +597,69 @@ def create_app(args):
name=args.simulated_model_name, tag=args.simulated_model_tag
)
# Initialize RAG with unified configuration
try:
rag = LightRAG(
working_dir=args.working_dir,
workspace=args.workspace,
llm_model_func=create_llm_model_func(args.llm_binding),
llm_model_name=args.llm_model,
llm_model_max_async=args.max_async,
summary_max_tokens=args.summary_max_tokens,
summary_context_size=args.summary_context_size,
chunk_token_size=int(args.chunk_size),
chunk_overlap_token_size=int(args.chunk_overlap_size),
llm_model_kwargs=create_llm_model_kwargs(
args.llm_binding, args, llm_timeout
),
embedding_func=embedding_func,
default_llm_timeout=llm_timeout,
default_embedding_timeout=embedding_timeout,
kv_storage=args.kv_storage,
graph_storage=args.graph_storage,
vector_storage=args.vector_storage,
doc_status_storage=args.doc_status_storage,
vector_db_storage_cls_kwargs={
"cosine_better_than_threshold": args.cosine_threshold
},
enable_llm_cache_for_entity_extract=args.enable_llm_cache_for_extract,
enable_llm_cache=args.enable_llm_cache,
rerank_model_func=rerank_model_func,
max_parallel_insert=args.max_parallel_insert,
max_graph_nodes=args.max_graph_nodes,
addon_params={
"language": args.summary_language,
"entity_types": args.entity_types,
},
ollama_server_infos=ollama_server_infos,
)
except Exception as e:
logger.error(f"Failed to initialize LightRAG: {e}")
raise
# Create a factory function for creating LightRAG instances with the given configuration
def create_lightrag_factory():
    """Build a factory that produces LightRAG instances with the server config.

    Captures the parsed server arguments (``args``) and the model helpers from
    the enclosing scope, so every workspace receives an identically configured
    LightRAG that differs only in its working directory and workspace name.

    Returns:
        A callable ``factory(working_dir, workspace)`` returning a LightRAG.
    """

    def factory(working_dir: str, workspace: str):
        # Gather every constructor argument into one mapping so the call site
        # stays flat; all values come straight from the server-wide settings.
        settings = {
            "working_dir": working_dir,
            "workspace": workspace,
            "llm_model_func": create_llm_model_func(args.llm_binding),
            "llm_model_name": args.llm_model,
            "llm_model_max_async": args.max_async,
            "summary_max_tokens": args.summary_max_tokens,
            "summary_context_size": args.summary_context_size,
            "chunk_token_size": int(args.chunk_size),
            "chunk_overlap_token_size": int(args.chunk_overlap_size),
            "llm_model_kwargs": create_llm_model_kwargs(
                args.llm_binding, args, llm_timeout
            ),
            "embedding_func": embedding_func,
            "default_llm_timeout": llm_timeout,
            "default_embedding_timeout": embedding_timeout,
            "kv_storage": args.kv_storage,
            "graph_storage": args.graph_storage,
            "vector_storage": args.vector_storage,
            "doc_status_storage": args.doc_status_storage,
            "vector_db_storage_cls_kwargs": {
                "cosine_better_than_threshold": args.cosine_threshold
            },
            "enable_llm_cache_for_entity_extract": args.enable_llm_cache_for_extract,
            "enable_llm_cache": args.enable_llm_cache,
            "rerank_model_func": rerank_model_func,
            "max_parallel_insert": args.max_parallel_insert,
            "max_graph_nodes": args.max_graph_nodes,
            "addon_params": {
                "language": args.summary_language,
                "entity_types": args.entity_types,
            },
            "ollama_server_infos": ollama_server_infos,
        }
        return LightRAG(**settings)

    return factory
# Add routes
# Create workspace manager with LightRAG factory
workspace_manager = WorkspaceManager(args, lightrag_factory=create_lightrag_factory())
app.state.workspace_manager = workspace_manager
# Add routes with workspace manager
app.include_router(
create_document_routes(
rag,
doc_manager,
workspace_manager,
api_key,
)
)
app.include_router(create_query_routes(rag, api_key, args.top_k))
app.include_router(create_graph_routes(rag, api_key))
app.include_router(create_search_routes(rag, api_key, args.top_k))
app.include_router(create_query_routes(workspace_manager, api_key, args.top_k))
app.include_router(create_graph_routes(workspace_manager, api_key))
app.include_router(create_search_routes(workspace_manager, api_key, args.top_k))
# Add Ollama API routes
ollama_api = OllamaAPI(rag, top_k=args.top_k, api_key=api_key)
# Add Ollama API routes with workspace manager
ollama_api = OllamaAPI(workspace_manager, top_k=args.top_k, api_key=api_key)
app.include_router(ollama_api.router, prefix="/api")
# Add workspace routes
logger.info("Including workspace router")
app.include_router(workspace_router)
@app.get("/")
async def redirect_to_webui():
"""Redirect root path to /webui"""