diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index e6435e9..4b604cd 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -3,7 +3,7 @@ name: Building Docker Image on: push: branches: - - 'temp-deployment' + - 'remove_old' jobs: docker-build: @@ -22,7 +22,7 @@ jobs: - name: Build docker container uses: docker/build-push-action@v5 with: - cache-from: type=gha - cache-to: type=gha,mode=max + # cache-from: type=gha + # cache-to: type=gha,mode=max tags: ghcr.io/meitrex/docprocai_service:latest push: true diff --git a/Dockerfile b/Dockerfile index 1c1e4a7..417b173 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,11 +15,11 @@ COPY requirements.txt . # install dependencies using pip RUN pip install --no-cache-dir -r requirements.txt -# Install torch manually, it needs a special version which supports our CUDA version. Install this AFTER the -# requirements, as some of the requirements might install torch as well, but we want to install the correct version -RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 - # copy the current directory contents into the container at /app COPY . . -CMD ["python", "./app.py"] \ No newline at end of file +# This environment variable must be set to connect this service with the media service. +ENV media_service_url="http://app-media:3001/graphql" +ENV connection_string="user=root password=root host=database port=5432 dbname=docprocai_service" + +CMD ["python", "./app.py"] diff --git a/README.md b/README.md index 5cc6020..3222dd6 100644 --- a/README.md +++ b/README.md @@ -12,36 +12,6 @@ This service is designed to process and manage uploaded lecture material (video For a deeper dive into the features and considerations made during development, check out our paper on *DocProcAI*. -## Installation -### Neural Network Models Installation -This service requires neural network models to function at all. 
These models need to be downloaded and placed into a `llm_data` folder in the root. This folder is then mounted in the docker container -automatically and the files inside can then be referenced as seen in the `config.yaml` - -> [!CAUTION] -> The service cannot run without at least a sentence embedding model installed! - -> [!TIP] -> The `segment_title_generator` and `document_summary_generator` tasks only require LLMs if these features are enabled in the `config.yaml`. They are enabled by default. - -### Recommended Neural Network Models - -* For the text embedding, we recommend [Alibaba-NLP/gte-large-en-v1.5](https://huggingface.co/Alibaba-NLP/gte-large-en-v1.5) -* For the title and summary generation, we recommend [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) -* While the title generation should work with just a base model, we recommend our custom fine-tuned LoRA Adapter for better results in the title generation task. The adapter files may be provided to you upon request. - -### GPU Acceleration -This service requires pytorch to function. As pytorch GPU-support is required for some features of this service, the pip-distributed version of pytorch cannot be used and instead a -platform-specific version has to be used. -By default, pytorch for NVIDIA CUDA 12.4 is used, as this should provide the most capability for widespread GPUs. If you need to use a different version of pytorch, you can change -the install script located in the `Dockerfile`. - -> [!WARNING] -> Note that GPU features require a supported GPU and OS to function, especially in conjunction with Docker, as the service runs in a Docker container. -> -> Docker does not provide GPU-support for MacOS at this point in time, thus GPU-features of the service do not function on MacOS. -> -> GPU features can be disabled using the `config.yaml`. Additionally, it might be necessary to change the `docker-compose.yaml` file and remove the GPU device reservation. 
- ## Configuration The service uses the `config.yaml` file located in the root directory for configuration. For further information about configuration check out this file, all configuration properties are explained using in-file comments. diff --git a/client/MediaServiceClient.py b/client/MediaServiceClient.py index 91bb0ca..7e8b9a5 100644 --- a/client/MediaServiceClient.py +++ b/client/MediaServiceClient.py @@ -2,9 +2,11 @@ import config import gql import gql.transport.aiohttp +import os class MediaServiceClient: + async def get_media_record_type_and_download_url(self, document_id: uuid.UUID) -> dict: self.__init_client_if_not_already() @@ -46,5 +48,5 @@ async def get_media_record_ids_of_contents(self, content_ids: list[uuid.UUID]) - return media_records def __init_client_if_not_already(self): - transport = gql.transport.aiohttp.AIOHTTPTransport(url=config.current["media_service_url"]) + transport = gql.transport.aiohttp.AIOHTTPTransport(url=os.environ.get("media_service_url")) self.__client = gql.Client(transport=transport, fetch_schema_from_transport=True) \ No newline at end of file diff --git a/config.yaml b/config.yaml index 0a3016e..cb263c9 100644 --- a/config.yaml +++ b/config.yaml @@ -1,12 +1,16 @@ # URL to the graphql endpoint of the media service. This is used to fetch information about media records the service # needs to process -media_service_url: "http://app-media:3001/graphql" +# The option was disabled as it is currently fetched from an environment variable. It will be removed in a future version. +#media_service_url: "http://app-media:3001/graphql" + + +# The option was disabled as it is currently fetched from an environment variable. It will be removed in a future version. # Settings pertaining to the database connection of the service. 
Note that the service expects a postgresql database # with the pgvector extension installed -database: +#database: # Connection string to the database - connection_string: "user=root password=root host=database port=5432 dbname=docprocai_service" + #connection_string: "user=root password=root host=database port=5432 dbname=docprocai_service" # Settings pertaining to the transcript generation for videos transcript_generation: @@ -31,7 +35,8 @@ text_embedding: protocol: "http" hostname: "129.69.217.248" port: 11435 - + dimensionality: 1024 + model_path: "/" # Settings pertaining to the content linking step during the content processing pipeline. Content linking is the process # of analyzing all documents and videos of a content object and checking which parts of the documents should be linked, @@ -130,4 +135,4 @@ lecture_llm_generator: I have some text extracted from the document of a lecture. Please create a very compact overview/table of contents about which topics are covered in this lecture. 
Your response must be under 800 characters in length.<|eot_id|><|start_header_id|>assistant<|end_header_id|> - Table of contents of the covered topics: \ No newline at end of file + Table of contents of the covered topics: diff --git a/controller/events.py b/controller/events.py index 76f63da..7fe49f8 100644 --- a/controller/events.py +++ b/controller/events.py @@ -1,4 +1,4 @@ -import uuid +from uuid import UUID from dataclasses import dataclass from enum import Enum, auto @@ -10,5 +10,5 @@ class CrudOperation(Enum): @dataclass class ContentChangeEvent: - contentIds: list[uuid] + contentIds: list[UUID] crudOperation: CrudOperation \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index eab7fe2..2d10324 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -27,13 +27,6 @@ services: - "9901:9901" depends_on: - database - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: all - capabilities: [ gpu ] dapr-docprocai: image: "daprio/daprd" command: [ @@ -58,4 +51,4 @@ volumes: networks: default: name: dapr-network - external: true \ No newline at end of file + external: true diff --git a/docker-compose.yml.backup b/docker-compose.yml.backup new file mode 100644 index 0000000..eab7fe2 --- /dev/null +++ b/docker-compose.yml.backup @@ -0,0 +1,61 @@ +services: + database: + image: pgvector/pgvector:pg16 + command: -c 'max_connections=500' + restart: unless-stopped + expose: + - 5432 + ports: + - "5432:5432" + volumes: + - dbdata:/var/lib/postgresql/data + - ./../docprocai_service/pg-init-scripts:/docker-entrypoint-initdb.d + environment: + - POSTGRES_USER=root + - POSTGRES_PASSWORD=root + - POSTGRES_CREATE_DB_DOCPROCAI_SERVICE=docprocai_service + app-docprocai: + build: + context: ./../docprocai_service + dockerfile: Dockerfile + restart: unless-stopped + container_name: docprocai_service + volumes: + - "./../docprocai_service/llm_data:/app/llm_data" + ports: + - "9900:9900" + - "9901:9901" + depends_on: + - 
database + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [ gpu ] + dapr-docprocai: + image: "daprio/daprd" + command: [ + "./daprd", + "--app-id", "docprocai_service", + "--app-port", "9901", + "--dapr-http-port", "9900", + "--resources-path", "./components" + ] + volumes: + - "./../docprocai_service/components/:/components" # Mount our components folder for the runtime to use. The mounted location must match the --resources-path argument. + depends_on: + - app-docprocai + - redis + network_mode: "service:app-docprocai" + redis: + image: "redis:alpine" + expose: + - "6379" +volumes: + dbdata: +networks: + default: + name: dapr-network + external: true \ No newline at end of file diff --git a/fileextractlib/DocumentData.py b/fileextractlib/DocumentData.py index b351224..9bc417b 100644 --- a/fileextractlib/DocumentData.py +++ b/fileextractlib/DocumentData.py @@ -1,10 +1,10 @@ from PIL.Image import Image -from torch import Tensor from typing import Optional - +from numpy.typing import NDArray +import numpy as np class PageData: - def __init__(self, page_number: int, text: str, thumbnail: Image, embedding: Optional[Tensor]): + def __init__(self, page_number: int, text: str, thumbnail: Image, embedding: Optional[NDArray[np.float64]]): self.page_number = page_number self.text = text self.thumbnail = thumbnail diff --git a/fileextractlib/LectureLlmGenerator.py b/fileextractlib/LectureLlmGenerator.py index fd4c1a3..257f406 100644 --- a/fileextractlib/LectureLlmGenerator.py +++ b/fileextractlib/LectureLlmGenerator.py @@ -2,38 +2,21 @@ import json from collections import OrderedDict import time -from typing import Optional +from typing import Optional, Any import pydantic -import torch.cuda import config from fileextractlib.DocumentData import DocumentData -from fileextractlib.LLMService import LlamaRunner, LLMProfile, SEGMENT_TITLE_GENERATOR_PROFILE, Hyperparameter +from fileextractlib.LLMService import LLMProfile, 
SEGMENT_TITLE_GENERATOR_PROFILE, Hyperparameter from fileextractlib.VideoData import VideoData import logging -from LLMService import DefaultLLMService, DOCUMENT_SUMMARY_GENERATOR_PROFILE + +from fileextractlib.LLMService import DefaultLLMService, DOCUMENT_SUMMARY_GENERATOR_PROFILE _logger = logging.getLogger(__name__) class LectureLlmGenerator: - def __init__(self): - if config.current["lecture_llm_generator"]["keep_models_loaded"]: - # if the config says that both llm generators shall use the same base & lora models, then we don't need to - # load both but can instead just use the same runner for both tasks - if ((config.current["lecture_llm_generator"]["segment_title_generator"]["base_model_path"] == - config.current["lecture_llm_generator"]["document_summary_generator"]["base_model_path"]) and - (config.current["lecture_llm_generator"]["segment_title_generator"]["lora_model_path"] == - config.current["lecture_llm_generator"]["document_summary_generator"]["lora_model_path"])): - # our unified runner for both tasks, just use the title runner, both are identical anyway - unified_runner = LectureLlmGenerator.__load_title_llama_runner() - self.__summarization_llama_runner = unified_runner - self.__title_llama_runner = unified_runner - else: - # otherwise, we'll have to load them separately - self.__summarization_llama_runner = LectureLlmGenerator.__load_summarization_llama_runner() - self.__title_llama_runner = LectureLlmGenerator.__load_title_llama_runner() - def generate_titles_for_video(self, video_data: VideoData) -> None: """ Uses an LLM to generate appropriate titles for the segments of the passed videos. 
Modifies the title field in @@ -144,7 +127,7 @@ def generate_summary_for_document(self, document_data: DocumentData) -> None: document_data.summary = [answer_text] @staticmethod - def __generate_answer_json(llm_service: DefaultLLMService, prompt, answer_schema: dict[str, any], profile: Optional[LLMProfile], hyperparameter: Optional[Hyperparameter]) -> any: + def __generate_answer_json(llm_service: DefaultLLMService, prompt, answer_schema: dict[str, Any], profile: Optional[LLMProfile], hyperparameter: Optional[Hyperparameter]) -> Any: generated_text = llm_service.run_custom(prompt, json.dumps(answer_schema), profile, hyperparameter) @@ -156,16 +139,4 @@ def __generate_answer_json(llm_service: DefaultLLMService, prompt, answer_schema answer_json = json.loads(generated_text) return answer_json except ValueError as e: - _logger.exception("Error while parsing LLM answer json.", exc_info=e) - - @staticmethod - def __load_summarization_llama_runner() -> LlamaRunner: - return LlamaRunner( - config.current["lecture_llm_generator"]["document_summary_generator"]["base_model_path"], - config.current["lecture_llm_generator"]["document_summary_generator"]["lora_model_path"]) - - @staticmethod - def __load_title_llama_runner() -> LlamaRunner: - return LlamaRunner( - config.current["lecture_llm_generator"]["segment_title_generator"]["base_model_path"], - config.current["lecture_llm_generator"]["segment_title_generator"]["lora_model_path"]) \ No newline at end of file + _logger.exception("Error while parsing LLM answer json.", exc_info=e) \ No newline at end of file diff --git a/fileextractlib/LectureVideoEmbeddingGenerator.py b/fileextractlib/LectureVideoEmbeddingGenerator.py index 3a8a5ef..81d937f 100644 --- a/fileextractlib/LectureVideoEmbeddingGenerator.py +++ b/fileextractlib/LectureVideoEmbeddingGenerator.py @@ -2,7 +2,6 @@ import time from fileextractlib.SentenceEmbeddingRunner import SentenceEmbeddingRunner -from fileextractlib.VideoProcessor import VideoProcessor from 
fileextractlib.VideoData import VideoSegmentData class LectureVideoEmbeddingGenerator: diff --git a/fileextractlib/SentenceEmbeddingRunner.py b/fileextractlib/SentenceEmbeddingRunner.py index c191950..c878520 100644 --- a/fileextractlib/SentenceEmbeddingRunner.py +++ b/fileextractlib/SentenceEmbeddingRunner.py @@ -1,10 +1,9 @@ from typing import List -import yaml -import urllib +import urllib.parse import requests import numpy as np -from numpy._typing import NDArray +from numpy.typing import NDArray import config @@ -20,8 +19,7 @@ def _create_url(self, words: List[str]) -> str: query_string = urllib.parse.urlencode(query_params) return f"{self.protocol}://{self.hostname}:{self.port}/embed?{query_string}" - def generate_embeddings(self, words: List[str]) -> NDArray[float]: - + def generate_embeddings(self, words: List[str]) -> NDArray[np.float64]: """ This method accepts a list of strings and computes for each its respective embedding vector. :param words: a list of words for which the respective embeddings shall be computed. diff --git a/fileextractlib/VideoData.py b/fileextractlib/VideoData.py index 64ef265..360e5be 100644 --- a/fileextractlib/VideoData.py +++ b/fileextractlib/VideoData.py @@ -1,7 +1,8 @@ import PIL.Image -from torch import Tensor from webvtt import WebVTT from typing import Optional +from numpy.typing import NDArray +import numpy as np class VideoSegmentData: """ @@ -15,13 +16,13 @@ def __init__(self, screen_text: str, thumbnail: PIL.Image.Image, title: Optional[str], - embedding: Optional[Tensor]): + embedding: Optional[NDArray[np.float64]]): self.start_time: int = start_time self.transcript: str = transcript self.screen_text: str = screen_text self.thumbnail: PIL.Image.Image = thumbnail self.title = title - self.embedding: Tensor = embedding + self.embedding: Optional[NDArray[np.float64]] = embedding class VideoData: @@ -29,7 +30,7 @@ class VideoData: Represents a video's data, containing the captions and the sections of the video. 
""" - def __init__(self, vtt: WebVTT, segments: list[VideoSegmentData], summary: list[str] = None): + def __init__(self, vtt: WebVTT, segments: list[VideoSegmentData], summary: Optional[list[str]] = None): if summary is None: summary = [] self.vtt: WebVTT = vtt diff --git a/persistence/SegmentDbConnector.py b/persistence/SegmentDbConnector.py index ce9e418..8849eb3 100644 --- a/persistence/SegmentDbConnector.py +++ b/persistence/SegmentDbConnector.py @@ -1,12 +1,10 @@ from typing import Optional - import psycopg - from pgvector.psycopg import register_vector - from persistence.entities import * - import config +from numpy.typing import NDArray +import numpy as np class SegmentDbConnector: @@ -75,7 +73,7 @@ def __init__(self, db_connection: psycopg.Connection): """) def insert_document_segment(self, text: str, media_record_id: UUID, page_index: int, - thumbnail: bytes, title: Optional[str], embedding: Tensor) -> None: + thumbnail: bytes, title: Optional[str], embedding: NDArray[np.float64]) -> None: self.db_connection.execute( query=""" INSERT INTO document_segments (text, media_record_id, page, thumbnail, title, embedding) @@ -84,7 +82,7 @@ def insert_document_segment(self, text: str, media_record_id: UUID, page_index: params=(text, media_record_id, page_index, thumbnail, title, embedding)) def insert_video_segment(self, screen_text: str, transcript: str, media_record_id: UUID, start_time: int, - thumbnail: bytes, title: str, embedding: Tensor) -> None: + thumbnail: bytes, title: str, embedding: NDArray[np.float64]) -> None: self.db_connection.execute( query=""" INSERT INTO video_segments ( @@ -104,7 +102,7 @@ def upsert_assessment_segment(self, task_id: UUID, assessment_id: UUID, textual_representation: str, - embedding: Tensor) -> None: + embedding: NDArray[np.float64]) -> None: # Use an upsert here instead of a regular insert because the primary key of the table isn't auto-generated but # instead manually set. 
Not using an upsert might result in exceptions in case we process something twice. self.db_connection.execute( @@ -222,7 +220,7 @@ def does_segment_link_exist(self, segment1_id: UUID, segment2_id: UUID, content_ return result["exists"] - def get_top_segments_by_embedding_distance(self, query_embedding: Tensor, + def get_top_segments_by_embedding_distance(self, query_embedding: NDArray[np.float64], count: int, parent_id_whitelist: list[UUID]) \ -> list[SemanticSearchResultEntity]: diff --git a/persistence/entities.py b/persistence/entities.py index 91a4daf..88188b9 100644 --- a/persistence/entities.py +++ b/persistence/entities.py @@ -1,6 +1,7 @@ from enum import Enum, auto from uuid import UUID -from torch import Tensor +from numpy.typing import NDArray +import numpy as np class MediaRecordEntity: def __init__(self, id: UUID, summary: list[str], tags: set): @@ -15,7 +16,7 @@ def __init__(self, id: UUID, tags: set): class DocumentSegmentEntity: def __init__(self, id: UUID, media_record_id: UUID, page_index: int, text: str, thumbnail: bytes, title: str, - embedding: Tensor): + embedding: NDArray[np.float64]): self.id = id self.media_record_id = media_record_id self.page_index = page_index @@ -27,7 +28,7 @@ def __init__(self, id: UUID, media_record_id: UUID, page_index: int, text: str, class VideoSegmentEntity: def __init__(self, id: UUID, media_record_id: UUID, start_time: int, transcript: str, - screen_text: str, thumbnail: bytes, title: str, embedding: Tensor): + screen_text: str, thumbnail: bytes, title: str, embedding: NDArray[np.float64]): self.id = id self.media_record_id = media_record_id self.start_time = start_time @@ -39,7 +40,7 @@ def __init__(self, id: UUID, media_record_id: UUID, start_time: int, transcript: class AssessmentSegmentEntity: - def __init__(self, id: UUID, assessment_id: UUID, textual_representation: str, embedding: Tensor): + def __init__(self, id: UUID, assessment_id: UUID, textual_representation: str, embedding: NDArray[np.float64]): 
self.id = id self.assessment_id = assessment_id self.textual_representation = textual_representation diff --git a/requirements.txt b/requirements.txt index bac3055..e2cfb9f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,5 @@ ffmpeg-python pytesseract -transformers==4.48.3 -bitsandbytes==0.44.1 pydantic lm-format-enforcer webvtt-py @@ -9,7 +7,6 @@ numpy openai-whisper Pillow pdf2image -sentence-transformers fastapi psycopg[binary] uvicorn @@ -22,9 +19,7 @@ opencv-python requests tika pypdf -peft pyyaml -torch annotated-types bertopic scikit-learn diff --git a/service/DocProcAiService.py b/service/DocProcAiService.py index 986c96d..579ebc4 100644 --- a/service/DocProcAiService.py +++ b/service/DocProcAiService.py @@ -5,6 +5,7 @@ import threading import time import uuid +import os from time import sleep from typing import Callable, Self, Awaitable, Optional @@ -15,7 +16,6 @@ import config import dto import dto.mapper as mapper -import fileextractlib.LLMService as LlamaRunner from fileextractlib.TopicModel import TopicModel from dto import MediaRecordSegmentLinkDto, SemanticSearchResultDto, \ AiEntityProcessingProgressDto, MediaRecordSegmentDto, TaskInformationDto @@ -32,14 +32,17 @@ from persistence.entities import * from utils.SortedPriorityQueue import SortedPriorityQueue from controller.events import ContentChangeEvent, CrudOperation +from numpy.typing import NDArray +import numpy as np _logger = logging.getLogger(__name__) class DocProcAiService: + def __init__(self): self.database_connection = psycopg.connect( - config.current["database"]["connection_string"], + os.environ.get("connection_string"), autocommit=True, row_factory=psycopg.rows.dict_row ) @@ -52,9 +55,6 @@ def __init__(self): # graphql client for interacting with the media service self.__media_service_client: MediaServiceClient.MediaServiceClient = MediaServiceClient.MediaServiceClient() - # only load the llamaRunner the first time we actually need it, not now - self.__llama_runner: 
LlamaRunner.LlamaRunner | None = None - self.__sentence_embedding_runner: SentenceEmbeddingRunner = SentenceEmbeddingRunner() self.__lecture_pdf_embedding_generator: LectureDocumentEmbeddingGenerator = LectureDocumentEmbeddingGenerator() @@ -78,7 +78,7 @@ def __init__(self): self.__background_task_thread.start() def __del__(self): - self._keep_background_task_thread_alive = False + self._keep_background_task_thread_alive.clear() def enqueue_ingest_media_record_task(self, media_record_id: uuid.UUID) -> None: """ @@ -247,7 +247,7 @@ async def generate_assessment_segments_task() -> None: sentence_embedding_runner: SentenceEmbeddingRunner = SentenceEmbeddingRunner() for task_information in task_information_list: - embedding: Tensor = sentence_embedding_runner.generate_embeddings( + embedding: NDArray[np.float64] = sentence_embedding_runner.generate_embeddings( [task_information["textualRepresentation"]])[0] self.segment_database.upsert_assessment_segment(task_information["taskId"],