From b6dd7c5d446da6c7ad8ae88fa778f905bb8eda2d Mon Sep 17 00:00:00 2001 From: myluki2000 Date: Fri, 18 Jul 2025 10:37:51 +0200 Subject: [PATCH 01/13] Remove unnecessary old stuff, fix some typing issues --- Dockerfile | 4 -- controller/events.py | 4 +- fileextractlib/DocumentData.py | 6 +-- fileextractlib/LectureLlmGenerator.py | 38 ++----------------- .../LectureVideoEmbeddingGenerator.py | 1 - fileextractlib/SentenceEmbeddingRunner.py | 8 ++-- fileextractlib/VideoData.py | 9 +++-- persistence/SegmentDbConnector.py | 14 +++---- persistence/entities.py | 9 +++-- requirements.txt | 5 --- service/DocProcAiService.py | 9 ++--- 11 files changed, 32 insertions(+), 75 deletions(-) diff --git a/Dockerfile b/Dockerfile index 1c1e4a7..36fc051 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,10 +15,6 @@ COPY requirements.txt . # install dependencies using pip RUN pip install --no-cache-dir -r requirements.txt -# Install torch manually, it needs a special version which supports our CUDA version. Install this AFTER the -# requirements, as some of the requirements might install torch as well, but we want to install the correct version -RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 - # copy the current directory contents into the container at /app COPY . . 
diff --git a/controller/events.py b/controller/events.py index 76f63da..7fe49f8 100644 --- a/controller/events.py +++ b/controller/events.py @@ -1,4 +1,4 @@ -import uuid +from uuid import UUID from dataclasses import dataclass from enum import Enum, auto @@ -10,5 +10,5 @@ class CrudOperation(Enum): @dataclass class ContentChangeEvent: - contentIds: list[uuid] + contentIds: list[UUID] crudOperation: CrudOperation \ No newline at end of file diff --git a/fileextractlib/DocumentData.py b/fileextractlib/DocumentData.py index b351224..9d9e93a 100644 --- a/fileextractlib/DocumentData.py +++ b/fileextractlib/DocumentData.py @@ -1,10 +1,10 @@ from PIL.Image import Image -from torch import Tensor from typing import Optional - +from numpy.typing import NDArray +import numpy as np class PageData: - def __init__(self, page_number: int, text: str, thumbnail: Image, embedding: Optional[Tensor]): + def __init__(self, page_number: int, text: str, thumbnail: Image, embedding: Optional[NDArray[np.float_]]): self.page_number = page_number self.text = text self.thumbnail = thumbnail diff --git a/fileextractlib/LectureLlmGenerator.py b/fileextractlib/LectureLlmGenerator.py index fd4c1a3..ced1a32 100644 --- a/fileextractlib/LectureLlmGenerator.py +++ b/fileextractlib/LectureLlmGenerator.py @@ -2,13 +2,12 @@ import json from collections import OrderedDict import time -from typing import Optional +from typing import Optional, Any import pydantic -import torch.cuda import config from fileextractlib.DocumentData import DocumentData -from fileextractlib.LLMService import LlamaRunner, LLMProfile, SEGMENT_TITLE_GENERATOR_PROFILE, Hyperparameter +from fileextractlib.LLMService import LLMProfile, SEGMENT_TITLE_GENERATOR_PROFILE, Hyperparameter from fileextractlib.VideoData import VideoData import logging @@ -17,23 +16,6 @@ _logger = logging.getLogger(__name__) class LectureLlmGenerator: - def __init__(self): - if config.current["lecture_llm_generator"]["keep_models_loaded"]: - # if the config 
says that both llm generators shall use the same base & lora models, then we don't need to - # load both but can instead just use the same runner for both tasks - if ((config.current["lecture_llm_generator"]["segment_title_generator"]["base_model_path"] == - config.current["lecture_llm_generator"]["document_summary_generator"]["base_model_path"]) and - (config.current["lecture_llm_generator"]["segment_title_generator"]["lora_model_path"] == - config.current["lecture_llm_generator"]["document_summary_generator"]["lora_model_path"])): - # our unified runner for both tasks, just use the title runner, both are identical anyway - unified_runner = LectureLlmGenerator.__load_title_llama_runner() - self.__summarization_llama_runner = unified_runner - self.__title_llama_runner = unified_runner - else: - # otherwise, we'll have to load them separately - self.__summarization_llama_runner = LectureLlmGenerator.__load_summarization_llama_runner() - self.__title_llama_runner = LectureLlmGenerator.__load_title_llama_runner() - def generate_titles_for_video(self, video_data: VideoData) -> None: """ Uses an LLM to generate appropriate titles for the segments of the passed videos. 
Modifies the title field in @@ -144,7 +126,7 @@ def generate_summary_for_document(self, document_data: DocumentData) -> None: document_data.summary = [answer_text] @staticmethod - def __generate_answer_json(llm_service: DefaultLLMService, prompt, answer_schema: dict[str, any], profile: Optional[LLMProfile], hyperparameter: Optional[Hyperparameter]) -> any: + def __generate_answer_json(llm_service: DefaultLLMService, prompt, answer_schema: dict[str, Any], profile: Optional[LLMProfile], hyperparameter: Optional[Hyperparameter]) -> Any: generated_text = llm_service.run_custom(prompt, json.dumps(answer_schema), profile, hyperparameter) @@ -156,16 +138,4 @@ def __generate_answer_json(llm_service: DefaultLLMService, prompt, answer_schema answer_json = json.loads(generated_text) return answer_json except ValueError as e: - _logger.exception("Error while parsing LLM answer json.", exc_info=e) - - @staticmethod - def __load_summarization_llama_runner() -> LlamaRunner: - return LlamaRunner( - config.current["lecture_llm_generator"]["document_summary_generator"]["base_model_path"], - config.current["lecture_llm_generator"]["document_summary_generator"]["lora_model_path"]) - - @staticmethod - def __load_title_llama_runner() -> LlamaRunner: - return LlamaRunner( - config.current["lecture_llm_generator"]["segment_title_generator"]["base_model_path"], - config.current["lecture_llm_generator"]["segment_title_generator"]["lora_model_path"]) \ No newline at end of file + _logger.exception("Error while parsing LLM answer json.", exc_info=e) \ No newline at end of file diff --git a/fileextractlib/LectureVideoEmbeddingGenerator.py b/fileextractlib/LectureVideoEmbeddingGenerator.py index 3a8a5ef..81d937f 100644 --- a/fileextractlib/LectureVideoEmbeddingGenerator.py +++ b/fileextractlib/LectureVideoEmbeddingGenerator.py @@ -2,7 +2,6 @@ import time from fileextractlib.SentenceEmbeddingRunner import SentenceEmbeddingRunner -from fileextractlib.VideoProcessor import VideoProcessor from 
fileextractlib.VideoData import VideoSegmentData class LectureVideoEmbeddingGenerator: diff --git a/fileextractlib/SentenceEmbeddingRunner.py b/fileextractlib/SentenceEmbeddingRunner.py index c191950..223e434 100644 --- a/fileextractlib/SentenceEmbeddingRunner.py +++ b/fileextractlib/SentenceEmbeddingRunner.py @@ -1,10 +1,9 @@ from typing import List -import yaml -import urllib +import urllib.parse import requests import numpy as np -from numpy._typing import NDArray +from numpy.typing import NDArray import config @@ -20,8 +19,7 @@ def _create_url(self, words: List[str]) -> str: query_string = urllib.parse.urlencode(query_params) return f"{self.protocol}://{self.hostname}:{self.port}/embed?{query_string}" - def generate_embeddings(self, words: List[str]) -> NDArray[float]: - + def generate_embeddings(self, words: List[str]) -> NDArray[np.float_]: """ This method accepts a list of strings and computes for each its respective embedding vector. :param words: a list of words for which the respective embeddings shall be computed. diff --git a/fileextractlib/VideoData.py b/fileextractlib/VideoData.py index 64ef265..4efa227 100644 --- a/fileextractlib/VideoData.py +++ b/fileextractlib/VideoData.py @@ -1,7 +1,8 @@ import PIL.Image -from torch import Tensor from webvtt import WebVTT from typing import Optional +from numpy.typing import NDArray +import numpy as np class VideoSegmentData: """ @@ -15,13 +16,13 @@ def __init__(self, screen_text: str, thumbnail: PIL.Image.Image, title: Optional[str], - embedding: Optional[Tensor]): + embedding: Optional[NDArray[np.float_]]): self.start_time: int = start_time self.transcript: str = transcript self.screen_text: str = screen_text self.thumbnail: PIL.Image.Image = thumbnail self.title = title - self.embedding: Tensor = embedding + self.embedding: Optional[NDArray[np.float_]] = embedding class VideoData: @@ -29,7 +30,7 @@ class VideoData: Represents a video's data, containing the captions and the sections of the video. 
""" - def __init__(self, vtt: WebVTT, segments: list[VideoSegmentData], summary: list[str] = None): + def __init__(self, vtt: WebVTT, segments: list[VideoSegmentData], summary: Optional[list[str]] = None): if summary is None: summary = [] self.vtt: WebVTT = vtt diff --git a/persistence/SegmentDbConnector.py b/persistence/SegmentDbConnector.py index ce9e418..bb2da91 100644 --- a/persistence/SegmentDbConnector.py +++ b/persistence/SegmentDbConnector.py @@ -1,12 +1,10 @@ from typing import Optional - import psycopg - from pgvector.psycopg import register_vector - from persistence.entities import * - import config +from numpy.typing import NDArray +import numpy as np class SegmentDbConnector: @@ -75,7 +73,7 @@ def __init__(self, db_connection: psycopg.Connection): """) def insert_document_segment(self, text: str, media_record_id: UUID, page_index: int, - thumbnail: bytes, title: Optional[str], embedding: Tensor) -> None: + thumbnail: bytes, title: Optional[str], embedding: NDArray[np.float_]) -> None: self.db_connection.execute( query=""" INSERT INTO document_segments (text, media_record_id, page, thumbnail, title, embedding) @@ -84,7 +82,7 @@ def insert_document_segment(self, text: str, media_record_id: UUID, page_index: params=(text, media_record_id, page_index, thumbnail, title, embedding)) def insert_video_segment(self, screen_text: str, transcript: str, media_record_id: UUID, start_time: int, - thumbnail: bytes, title: str, embedding: Tensor) -> None: + thumbnail: bytes, title: str, embedding: NDArray[np.float_]) -> None: self.db_connection.execute( query=""" INSERT INTO video_segments ( @@ -104,7 +102,7 @@ def upsert_assessment_segment(self, task_id: UUID, assessment_id: UUID, textual_representation: str, - embedding: Tensor) -> None: + embedding: NDArray[np.float_]) -> None: # Use an upsert here instead of a regular insert because the primary key of the table isn't auto-generated but # instead manually set. 
Not using an upsert might result in exceptions in case we process something twice. self.db_connection.execute( @@ -222,7 +220,7 @@ def does_segment_link_exist(self, segment1_id: UUID, segment2_id: UUID, content_ return result["exists"] - def get_top_segments_by_embedding_distance(self, query_embedding: Tensor, + def get_top_segments_by_embedding_distance(self, query_embedding: NDArray[np.float_], count: int, parent_id_whitelist: list[UUID]) \ -> list[SemanticSearchResultEntity]: diff --git a/persistence/entities.py b/persistence/entities.py index 91a4daf..4df2bef 100644 --- a/persistence/entities.py +++ b/persistence/entities.py @@ -1,6 +1,7 @@ from enum import Enum, auto from uuid import UUID -from torch import Tensor +from numpy.typing import NDArray +import numpy as np class MediaRecordEntity: def __init__(self, id: UUID, summary: list[str], tags: set): @@ -15,7 +16,7 @@ def __init__(self, id: UUID, tags: set): class DocumentSegmentEntity: def __init__(self, id: UUID, media_record_id: UUID, page_index: int, text: str, thumbnail: bytes, title: str, - embedding: Tensor): + embedding: NDArray[np.float_]): self.id = id self.media_record_id = media_record_id self.page_index = page_index @@ -27,7 +28,7 @@ def __init__(self, id: UUID, media_record_id: UUID, page_index: int, text: str, class VideoSegmentEntity: def __init__(self, id: UUID, media_record_id: UUID, start_time: int, transcript: str, - screen_text: str, thumbnail: bytes, title: str, embedding: Tensor): + screen_text: str, thumbnail: bytes, title: str, embedding: NDArray[np.float_]): self.id = id self.media_record_id = media_record_id self.start_time = start_time @@ -39,7 +40,7 @@ def __init__(self, id: UUID, media_record_id: UUID, start_time: int, transcript: class AssessmentSegmentEntity: - def __init__(self, id: UUID, assessment_id: UUID, textual_representation: str, embedding: Tensor): + def __init__(self, id: UUID, assessment_id: UUID, textual_representation: str, embedding: NDArray[np.float_]): self.id 
= id self.assessment_id = assessment_id self.textual_representation = textual_representation diff --git a/requirements.txt b/requirements.txt index bac3055..e2cfb9f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,5 @@ ffmpeg-python pytesseract -transformers==4.48.3 -bitsandbytes==0.44.1 pydantic lm-format-enforcer webvtt-py @@ -9,7 +7,6 @@ numpy openai-whisper Pillow pdf2image -sentence-transformers fastapi psycopg[binary] uvicorn @@ -22,9 +19,7 @@ opencv-python requests tika pypdf -peft pyyaml -torch annotated-types bertopic scikit-learn diff --git a/service/DocProcAiService.py b/service/DocProcAiService.py index 986c96d..7d2b044 100644 --- a/service/DocProcAiService.py +++ b/service/DocProcAiService.py @@ -32,6 +32,8 @@ from persistence.entities import * from utils.SortedPriorityQueue import SortedPriorityQueue from controller.events import ContentChangeEvent, CrudOperation +from numpy.typing import NDArray +import numpy as np _logger = logging.getLogger(__name__) @@ -52,9 +54,6 @@ def __init__(self): # graphql client for interacting with the media service self.__media_service_client: MediaServiceClient.MediaServiceClient = MediaServiceClient.MediaServiceClient() - # only load the llamaRunner the first time we actually need it, not now - self.__llama_runner: LlamaRunner.LlamaRunner | None = None - self.__sentence_embedding_runner: SentenceEmbeddingRunner = SentenceEmbeddingRunner() self.__lecture_pdf_embedding_generator: LectureDocumentEmbeddingGenerator = LectureDocumentEmbeddingGenerator() @@ -78,7 +77,7 @@ def __init__(self): self.__background_task_thread.start() def __del__(self): - self._keep_background_task_thread_alive = False + self._keep_background_task_thread_alive.clear() def enqueue_ingest_media_record_task(self, media_record_id: uuid.UUID) -> None: """ @@ -247,7 +246,7 @@ async def generate_assessment_segments_task() -> None: sentence_embedding_runner: SentenceEmbeddingRunner = SentenceEmbeddingRunner() for task_information in 
task_information_list: - embedding: Tensor = sentence_embedding_runner.generate_embeddings( + embedding: NDArray[np.float_] = sentence_embedding_runner.generate_embeddings( [task_information["textualRepresentation"]])[0] self.segment_database.upsert_assessment_segment(task_information["taskId"], From 5c513475a1c9679af9639ecb60dbde6c527ea473 Mon Sep 17 00:00:00 2001 From: Pascal Schur <36639504+schurpl@users.noreply.github.com> Date: Fri, 18 Jul 2025 10:40:09 +0200 Subject: [PATCH 02/13] Update docker-build.yml --- .github/workflows/docker-build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index e6435e9..afbd0a8 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -3,7 +3,7 @@ name: Building Docker Image on: push: branches: - - 'temp-deployment' + - 'remove_old' jobs: docker-build: From d779e23ed89757d8c43104c60ebdcc35407dbd92 Mon Sep 17 00:00:00 2001 From: Pascal Schur <36639504+schurpl@users.noreply.github.com> Date: Fri, 18 Jul 2025 10:43:48 +0200 Subject: [PATCH 03/13] Update config.yaml --- config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config.yaml b/config.yaml index 0a3016e..2c008b5 100644 --- a/config.yaml +++ b/config.yaml @@ -1,6 +1,6 @@ # URL to the graphql endpoint of the media service. This is used to fetch information about media records the service # needs to process -media_service_url: "http://app-media:3001/graphql" +media_service_url: "http://localhost:3500/v1.0/invoke/docprocai-service/method/graphql" # Settings pertaining to the database connection of the service. Note that the service expects a postgresql database # with the pgvector extension installed @@ -130,4 +130,4 @@ lecture_llm_generator: I have some text extracted from the document of a lecture. Please create a very compact overview/table of contents about which topics are covered in this lecture. 
Your response must be under 800 characters in length.<|eot_id|><|start_header_id|>assistant<|end_header_id|> - Table of contents of the covered topics: \ No newline at end of file + Table of contents of the covered topics: From 86fe09b620657e7b5dceaa551ebc5ec8d9e98422 Mon Sep 17 00:00:00 2001 From: Pascal Schur <36639504+schurpl@users.noreply.github.com> Date: Fri, 18 Jul 2025 11:53:11 +0200 Subject: [PATCH 04/13] Downgrade numpy to version 1.26.4 Downgrade numpy to version 1.26.4 because persistence/entities.py uses the np.float_ type alias, which was removed in NumPy 2.0 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e2cfb9f..18b9d1e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ pytesseract pydantic lm-format-enforcer webvtt-py -numpy +numpy == 1.26.4 openai-whisper Pillow pdf2image From 83d644b92375ceb1dba865c4361393a9e635f970 Mon Sep 17 00:00:00 2001 From: Pascal Schur <36639504+schurpl@users.noreply.github.com> Date: Fri, 18 Jul 2025 12:25:36 +0200 Subject: [PATCH 05/13] Update requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 18b9d1e..e2cfb9f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ pytesseract pydantic lm-format-enforcer webvtt-py -numpy == 1.26.4 +numpy openai-whisper Pillow pdf2image From da7017c4cee2c958570ae8884303295a8f37ded9 Mon Sep 17 00:00:00 2001 From: myluki2000 Date: Fri, 18 Jul 2025 12:46:08 +0200 Subject: [PATCH 06/13] change numpy.float_ to float64 --- service/DocProcAiService.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/service/DocProcAiService.py b/service/DocProcAiService.py index 7d2b044..6c11f7e 100644 --- a/service/DocProcAiService.py +++ b/service/DocProcAiService.py @@ -246,7 +246,7 @@ async def generate_assessment_segments_task() -> None: sentence_embedding_runner: SentenceEmbeddingRunner =
SentenceEmbeddingRunner() for task_information in task_information_list: - embedding: NDArray[np.float_] = sentence_embedding_runner.generate_embeddings( + embedding: NDArray[np.float64] = sentence_embedding_runner.generate_embeddings( [task_information["textualRepresentation"]])[0] self.segment_database.upsert_assessment_segment(task_information["taskId"], From c64c3d4ecb8e5c8d7ec232367ba4cf1ed237fd4a Mon Sep 17 00:00:00 2001 From: Pascal Schur <36639504+schurpl@users.noreply.github.com> Date: Fri, 18 Jul 2025 13:31:49 +0200 Subject: [PATCH 07/13] Update docker-build.yml --- .github/workflows/docker-build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index afbd0a8..4b604cd 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -22,7 +22,7 @@ jobs: - name: Build docker container uses: docker/build-push-action@v5 with: - cache-from: type=gha - cache-to: type=gha,mode=max + # cache-from: type=gha + # cache-to: type=gha,mode=max tags: ghcr.io/meitrex/docprocai_service:latest push: true From d9e168c0f6e12296f23eb0c92eb4a24fc78863f5 Mon Sep 17 00:00:00 2001 From: Philipp Kunz Date: Wed, 30 Jul 2025 11:06:32 +0200 Subject: [PATCH 08/13] Replaced float_ by float64. 
--- fileextractlib/DocumentData.py | 2 +- fileextractlib/SentenceEmbeddingRunner.py | 2 +- fileextractlib/VideoData.py | 4 ++-- persistence/SegmentDbConnector.py | 8 ++++---- persistence/entities.py | 6 +++--- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fileextractlib/DocumentData.py b/fileextractlib/DocumentData.py index 9d9e93a..9bc417b 100644 --- a/fileextractlib/DocumentData.py +++ b/fileextractlib/DocumentData.py @@ -4,7 +4,7 @@ import numpy as np class PageData: - def __init__(self, page_number: int, text: str, thumbnail: Image, embedding: Optional[NDArray[np.float_]]): + def __init__(self, page_number: int, text: str, thumbnail: Image, embedding: Optional[NDArray[np.float64]]): self.page_number = page_number self.text = text self.thumbnail = thumbnail diff --git a/fileextractlib/SentenceEmbeddingRunner.py b/fileextractlib/SentenceEmbeddingRunner.py index 223e434..c878520 100644 --- a/fileextractlib/SentenceEmbeddingRunner.py +++ b/fileextractlib/SentenceEmbeddingRunner.py @@ -19,7 +19,7 @@ def _create_url(self, words: List[str]) -> str: query_string = urllib.parse.urlencode(query_params) return f"{self.protocol}://{self.hostname}:{self.port}/embed?{query_string}" - def generate_embeddings(self, words: List[str]) -> NDArray[np.float_]: + def generate_embeddings(self, words: List[str]) -> NDArray[np.float64]: """ This method accepts a list of strings and computes for each its respective embedding vector. :param words: a list of words for which the respective embeddings shall be computed. 
diff --git a/fileextractlib/VideoData.py b/fileextractlib/VideoData.py index 4efa227..360e5be 100644 --- a/fileextractlib/VideoData.py +++ b/fileextractlib/VideoData.py @@ -16,13 +16,13 @@ def __init__(self, screen_text: str, thumbnail: PIL.Image.Image, title: Optional[str], - embedding: Optional[NDArray[np.float_]]): + embedding: Optional[NDArray[np.float64]]): self.start_time: int = start_time self.transcript: str = transcript self.screen_text: str = screen_text self.thumbnail: PIL.Image.Image = thumbnail self.title = title - self.embedding: Optional[NDArray[np.float_]] = embedding + self.embedding: Optional[NDArray[np.float64]] = embedding class VideoData: diff --git a/persistence/SegmentDbConnector.py b/persistence/SegmentDbConnector.py index bb2da91..8849eb3 100644 --- a/persistence/SegmentDbConnector.py +++ b/persistence/SegmentDbConnector.py @@ -73,7 +73,7 @@ def __init__(self, db_connection: psycopg.Connection): """) def insert_document_segment(self, text: str, media_record_id: UUID, page_index: int, - thumbnail: bytes, title: Optional[str], embedding: NDArray[np.float_]) -> None: + thumbnail: bytes, title: Optional[str], embedding: NDArray[np.float64]) -> None: self.db_connection.execute( query=""" INSERT INTO document_segments (text, media_record_id, page, thumbnail, title, embedding) @@ -82,7 +82,7 @@ def insert_document_segment(self, text: str, media_record_id: UUID, page_index: params=(text, media_record_id, page_index, thumbnail, title, embedding)) def insert_video_segment(self, screen_text: str, transcript: str, media_record_id: UUID, start_time: int, - thumbnail: bytes, title: str, embedding: NDArray[np.float_]) -> None: + thumbnail: bytes, title: str, embedding: NDArray[np.float64]) -> None: self.db_connection.execute( query=""" INSERT INTO video_segments ( @@ -102,7 +102,7 @@ def upsert_assessment_segment(self, task_id: UUID, assessment_id: UUID, textual_representation: str, - embedding: NDArray[np.float_]) -> None: + embedding: 
NDArray[np.float64]) -> None: # Use an upsert here instead of a regular insert because the primary key of the table isn't auto-generated but # instead manually set. Not using an upsert might result in exceptions in case we process something twice. self.db_connection.execute( @@ -220,7 +220,7 @@ def does_segment_link_exist(self, segment1_id: UUID, segment2_id: UUID, content_ return result["exists"] - def get_top_segments_by_embedding_distance(self, query_embedding: NDArray[np.float_], + def get_top_segments_by_embedding_distance(self, query_embedding: NDArray[np.float64], count: int, parent_id_whitelist: list[UUID]) \ -> list[SemanticSearchResultEntity]: diff --git a/persistence/entities.py b/persistence/entities.py index 4df2bef..88188b9 100644 --- a/persistence/entities.py +++ b/persistence/entities.py @@ -16,7 +16,7 @@ def __init__(self, id: UUID, tags: set): class DocumentSegmentEntity: def __init__(self, id: UUID, media_record_id: UUID, page_index: int, text: str, thumbnail: bytes, title: str, - embedding: NDArray[np.float_]): + embedding: NDArray[np.float64]): self.id = id self.media_record_id = media_record_id self.page_index = page_index @@ -28,7 +28,7 @@ def __init__(self, id: UUID, media_record_id: UUID, page_index: int, text: str, class VideoSegmentEntity: def __init__(self, id: UUID, media_record_id: UUID, start_time: int, transcript: str, - screen_text: str, thumbnail: bytes, title: str, embedding: NDArray[np.float_]): + screen_text: str, thumbnail: bytes, title: str, embedding: NDArray[np.float64]): self.id = id self.media_record_id = media_record_id self.start_time = start_time @@ -40,7 +40,7 @@ def __init__(self, id: UUID, media_record_id: UUID, start_time: int, transcript: class AssessmentSegmentEntity: - def __init__(self, id: UUID, assessment_id: UUID, textual_representation: str, embedding: NDArray[np.float_]): + def __init__(self, id: UUID, assessment_id: UUID, textual_representation: str, embedding: NDArray[np.float64]): self.id = id 
self.assessment_id = assessment_id self.textual_representation = textual_representation From 9cd1fb468cb60b6e6051e9295a5e23a49a90e651 Mon Sep 17 00:00:00 2001 From: Philipp Kunz Date: Wed, 30 Jul 2025 14:05:28 +0200 Subject: [PATCH 09/13] Broken imports fixed. --- config.yaml | 52 +++++++++++++++++++++++++-- fileextractlib/LectureLlmGenerator.py | 3 +- service/DocProcAiService.py | 1 - 3 files changed, 52 insertions(+), 4 deletions(-) diff --git a/config.yaml b/config.yaml index 2c008b5..b124b25 100644 --- a/config.yaml +++ b/config.yaml @@ -31,7 +31,7 @@ text_embedding: protocol: "http" hostname: "129.69.217.248" port: 11435 - + dimensionality: 1024 # Settings pertaining to the content linking step during the content processing pipeline. Content linking is the process # of analyzing all documents and videos of a content object and checking which parts of the documents should be linked, @@ -99,7 +99,55 @@ lecture_llm_generator: ```<|eot_id|><|start_header_id|>assistant<|end_header_id|> # Settings pertaining to the generation of summaries for documents - document_summary_generator: + document_summary_generator:services: + database: + image: pgvector/pgvector:pg16 + command: -c 'max_connections=500' + restart: unless-stopped + expose: + - 5432 + ports: + - "5432:5432" + volumes: + - dbdata:/var/lib/postgresql/data + - ./../docprocai_service/pg-init-scripts:/docker-entrypoint-initdb.d + environment: + - POSTGRES_USER=root + - POSTGRES_PASSWORD=root + - POSTGRES_CREATE_DB_DOCPROCAI_SERVICE=docprocai_service + app-docprocai: + build: + context: ./../docprocai_service + dockerfile: Dockerfile + restart: unless-stopped + container_name: docprocai_service + volumes: + - "./../docprocai_service/llm_data:/app/llm_data" + ports: + - "9900:9900" + - "9901:9901" + depends_on: + - database + dapr-docprocai: + image: "daprio/daprd" + command: [ + "./daprd", + "--app-id", "docprocai_service", + "--app-port", "9901", + "--dapr-http-port", "9900", + "--resources-path", 
"./components" + ] + volumes: + - "./../docprocai_service/components/:/components" # Mount our components folder for the runtime to use. The mounted location must match the --resources-path argument. + depends_on: + - app-docprocai + - redis + network_mode: "service:app-docprocai" + redis: + image: "redis:alpine" + expose: + - "6379" + # If true, llm features are enabled in the processing pipeline. If false, these steps are skipped in the pipeline # and relevant data attributes replaced by placeholders or left unset # Default: true diff --git a/fileextractlib/LectureLlmGenerator.py b/fileextractlib/LectureLlmGenerator.py index ced1a32..257f406 100644 --- a/fileextractlib/LectureLlmGenerator.py +++ b/fileextractlib/LectureLlmGenerator.py @@ -11,7 +11,8 @@ from fileextractlib.VideoData import VideoData import logging -from LLMService import DefaultLLMService, DOCUMENT_SUMMARY_GENERATOR_PROFILE + +from fileextractlib.LLMService import DefaultLLMService, DOCUMENT_SUMMARY_GENERATOR_PROFILE _logger = logging.getLogger(__name__) diff --git a/service/DocProcAiService.py b/service/DocProcAiService.py index 6c11f7e..46a9ed7 100644 --- a/service/DocProcAiService.py +++ b/service/DocProcAiService.py @@ -15,7 +15,6 @@ import config import dto import dto.mapper as mapper -import fileextractlib.LLMService as LlamaRunner from fileextractlib.TopicModel import TopicModel from dto import MediaRecordSegmentLinkDto, SemanticSearchResultDto, \ AiEntityProcessingProgressDto, MediaRecordSegmentDto, TaskInformationDto From f171923d3f15912638e9457813db686ea1487155 Mon Sep 17 00:00:00 2001 From: Philipp Kunz Date: Tue, 12 Aug 2025 00:02:40 +0200 Subject: [PATCH 10/13] Fixed invalid media service url. 
--- config.yaml | 5 +++- docker-compose.yml | 9 +----- docker-compose.yml.backup | 61 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 9 deletions(-) create mode 100644 docker-compose.yml.backup diff --git a/config.yaml b/config.yaml index b124b25..02c6e77 100644 --- a/config.yaml +++ b/config.yaml @@ -1,6 +1,8 @@ # URL to the graphql endpoint of the media service. This is used to fetch information about media records the service # needs to process -media_service_url: "http://localhost:3500/v1.0/invoke/docprocai-service/method/graphql" +#media_service_url: "http://localhost:3500/v1.0/invoke/docprocai-service/method/graphql" + +media_service_url: "http://app-media:3001/graphql" # Settings pertaining to the database connection of the service. Note that the service expects a postgresql database # with the pgvector extension installed @@ -32,6 +34,7 @@ text_embedding: hostname: "129.69.217.248" port: 11435 dimensionality: 1024 + model_path: "/" # Settings pertaining to the content linking step during the content processing pipeline. 
Content linking is the process # of analyzing all documents and videos of a content object and checking which parts of the documents should be linked, diff --git a/docker-compose.yml b/docker-compose.yml index eab7fe2..2d10324 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -27,13 +27,6 @@ services: - "9901:9901" depends_on: - database - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: all - capabilities: [ gpu ] dapr-docprocai: image: "daprio/daprd" command: [ @@ -58,4 +51,4 @@ volumes: networks: default: name: dapr-network - external: true \ No newline at end of file + external: true diff --git a/docker-compose.yml.backup b/docker-compose.yml.backup new file mode 100644 index 0000000..eab7fe2 --- /dev/null +++ b/docker-compose.yml.backup @@ -0,0 +1,61 @@ +services: + database: + image: pgvector/pgvector:pg16 + command: -c 'max_connections=500' + restart: unless-stopped + expose: + - 5432 + ports: + - "5432:5432" + volumes: + - dbdata:/var/lib/postgresql/data + - ./../docprocai_service/pg-init-scripts:/docker-entrypoint-initdb.d + environment: + - POSTGRES_USER=root + - POSTGRES_PASSWORD=root + - POSTGRES_CREATE_DB_DOCPROCAI_SERVICE=docprocai_service + app-docprocai: + build: + context: ./../docprocai_service + dockerfile: Dockerfile + restart: unless-stopped + container_name: docprocai_service + volumes: + - "./../docprocai_service/llm_data:/app/llm_data" + ports: + - "9900:9900" + - "9901:9901" + depends_on: + - database + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [ gpu ] + dapr-docprocai: + image: "daprio/daprd" + command: [ + "./daprd", + "--app-id", "docprocai_service", + "--app-port", "9901", + "--dapr-http-port", "9900", + "--resources-path", "./components" + ] + volumes: + - "./../docprocai_service/components/:/components" # Mount our components folder for the runtime to use. The mounted location must match the --resources-path argument. 
+ depends_on: + - app-docprocai + - redis + network_mode: "service:app-docprocai" + redis: + image: "redis:alpine" + expose: + - "6379" +volumes: + dbdata: +networks: + default: + name: dapr-network + external: true \ No newline at end of file From c1d4f8b6ae193a5ad0c46b34ce74bd2b16d989f7 Mon Sep 17 00:00:00 2001 From: Philipp Kunz Date: Tue, 12 Aug 2025 00:24:02 +0200 Subject: [PATCH 11/13] config.yaml fixed. --- config.yaml | 51 +-------------------------------------------------- 1 file changed, 1 insertion(+), 50 deletions(-) diff --git a/config.yaml b/config.yaml index 02c6e77..dda2d49 100644 --- a/config.yaml +++ b/config.yaml @@ -1,6 +1,5 @@ # URL to the graphql endpoint of the media service. This is used to fetch information about media records the service # needs to process -#media_service_url: "http://localhost:3500/v1.0/invoke/docprocai-service/method/graphql" media_service_url: "http://app-media:3001/graphql" @@ -102,55 +101,7 @@ lecture_llm_generator: ```<|eot_id|><|start_header_id|>assistant<|end_header_id|> # Settings pertaining to the generation of summaries for documents - document_summary_generator:services: - database: - image: pgvector/pgvector:pg16 - command: -c 'max_connections=500' - restart: unless-stopped - expose: - - 5432 - ports: - - "5432:5432" - volumes: - - dbdata:/var/lib/postgresql/data - - ./../docprocai_service/pg-init-scripts:/docker-entrypoint-initdb.d - environment: - - POSTGRES_USER=root - - POSTGRES_PASSWORD=root - - POSTGRES_CREATE_DB_DOCPROCAI_SERVICE=docprocai_service - app-docprocai: - build: - context: ./../docprocai_service - dockerfile: Dockerfile - restart: unless-stopped - container_name: docprocai_service - volumes: - - "./../docprocai_service/llm_data:/app/llm_data" - ports: - - "9900:9900" - - "9901:9901" - depends_on: - - database - dapr-docprocai: - image: "daprio/daprd" - command: [ - "./daprd", - "--app-id", "docprocai_service", - "--app-port", "9901", - "--dapr-http-port", "9900", - "--resources-path", 
"./components" - ] - volumes: - - "./../docprocai_service/components/:/components" # Mount our components folder for the runtime to use. The mounted location must match the --resources-path argument. - depends_on: - - app-docprocai - - redis - network_mode: "service:app-docprocai" - redis: - image: "redis:alpine" - expose: - - "6379" - + document_summary_generator: # If true, llm features are enabled in the processing pipeline. If false, these steps are skipped in the pipeline # and relevant data attributes replaced by placeholders or left unset # Default: true From 2fd32c21a4f2bf404022fbd655f6a085b990bb67 Mon Sep 17 00:00:00 2001 From: pkunz96 <49315956+pkunz96@users.noreply.github.com> Date: Tue, 12 Aug 2025 00:45:37 +0200 Subject: [PATCH 12/13] Update README.md Removed obsolete documentation on the formerly required GPU support. --- README.md | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/README.md b/README.md index 5cc6020..3222dd6 100644 --- a/README.md +++ b/README.md @@ -12,36 +12,6 @@ This service is designed to process and manage uploaded lecture material (video For a deeper dive into the features and considerations made during development, check out our paper on *DocProcAI*. -## Installation -### Neural Network Models Installation -This service requires neural network models to function at all. These models need to be downloaded and placed into a `llm_data` folder in the root. This folder is then mounted in the docker container -automatically and the files inside can then be referenced as seen in the `config.yaml` - -> [!CAUTION] -> The service cannot run without at least a sentence embedding model installed! - -> [!TIP] -> The `segment_title_generator` and `document_summary_generator` tasks only require LLMs if these features are enabled in the `config.yaml`. They are enabled by default. 
- -### Recommended Neural Network Models - -* For the text embedding, we recommend [Alibaba-NLP/gte-large-en-v1.5](https://huggingface.co/Alibaba-NLP/gte-large-en-v1.5) -* For the title and summary generation, we recommend [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) -* While the title generation should work with just a base model, we recommend our custom fine-tuned LoRA Adapter for better results in the title generation task. The adapter files may be provided to you upon request. - -### GPU Acceleration -This service requires pytorch to function. As pytorch GPU-support is required for some features of this service, the pip-distributed version of pytorch cannot be used and instead a -platform-specific version has to be used. -By default, pytorch for NVIDIA CUDA 12.4 is used, as this should provide the most capability for widespread GPUs. If you need to use a different version of pytorch, you can change -the install script located in the `Dockerfile`. - -> [!WARNING] -> Note that GPU features require a supported GPU and OS to function, especially in conjunction with Docker, as the service runs in a Docker container. -> -> Docker does not provide GPU-support for MacOS at this point in time, thus GPU-features of the service do not function on MacOS. -> -> GPU features can be disabled using the `config.yaml`. Additionally, it might be necessary to change the `docker-compose.yaml` file and remove the GPU device reservation. - ## Configuration The service uses the `config.yaml` file located in the root directory for configuration. For further information about configuration check out this file, all configuration properties are explained using in-file comments. From cd90369694ef9cb5e73e820238695107508ee310 Mon Sep 17 00:00:00 2001 From: Philipp Kunz Date: Fri, 15 Aug 2025 21:59:54 +0200 Subject: [PATCH 13/13] Made db_conf_str and media_service_url configurable through an environment variable. 
--- Dockerfile | 6 +++++- client/MediaServiceClient.py | 4 +++- config.yaml | 9 ++++++--- service/DocProcAiService.py | 4 +++- 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index 36fc051..417b173 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,4 +18,8 @@ RUN pip install --no-cache-dir -r requirements.txt # copy the current directory contents into the container at /app COPY . . -CMD ["python", "./app.py"] \ No newline at end of file +# These environment variables must be set to connect this service with the media service and the database. +ENV media_service_url="http://app-media:3001/graphql" +ENV connection_string="user=root password=root host=database port=5432 dbname=docprocai_service" + +CMD ["python", "./app.py"] diff --git a/client/MediaServiceClient.py b/client/MediaServiceClient.py index 91bb0ca..7e8b9a5 100644 --- a/client/MediaServiceClient.py +++ b/client/MediaServiceClient.py @@ -2,9 +2,11 @@ import config import gql import gql.transport.aiohttp +import os class MediaServiceClient: + async def get_media_record_type_and_download_url(self, document_id: uuid.UUID) -> dict: self.__init_client_if_not_already() @@ -46,5 +48,5 @@ async def get_media_record_ids_of_contents(self, content_ids: list[uuid.UUID]) - return media_records def __init_client_if_not_already(self): - transport = gql.transport.aiohttp.AIOHTTPTransport(url=config.current["media_service_url"]) + transport = gql.transport.aiohttp.AIOHTTPTransport(url=os.environ.get("media_service_url")) self.__client = gql.Client(transport=transport, fetch_schema_from_transport=True) \ No newline at end of file diff --git a/config.yaml b/config.yaml index dda2d49..cb263c9 100644 --- a/config.yaml +++ b/config.yaml @@ -1,13 +1,16 @@ # URL to the graphql endpoint of the media service.
This is used to fetch information about media records the service # needs to process -media_service_url: "http://app-media:3001/graphql" +# This option is disabled because the URL is now read from the media_service_url environment variable. It will be removed in a future version. +#media_service_url: "http://app-media:3001/graphql" + +# The database settings below are disabled because the connection string is now read from the connection_string environment variable. They will be removed in a future version. # Settings pertaining to the database connection of the service. Note that the service expects a postgresql database # with the pgvector extension installed -database: +#database: # Connection string to the database - connection_string: "user=root password=root host=database port=5432 dbname=docprocai_service" + #connection_string: "user=root password=root host=database port=5432 dbname=docprocai_service" # Settings pertaining to the transcript generation for videos transcript_generation: diff --git a/service/DocProcAiService.py b/service/DocProcAiService.py index 46a9ed7..579ebc4 100644 --- a/service/DocProcAiService.py +++ b/service/DocProcAiService.py @@ -5,6 +5,7 @@ import threading import time import uuid +import os from time import sleep from typing import Callable, Self, Awaitable, Optional @@ -38,9 +39,10 @@ class DocProcAiService: + def __init__(self): self.database_connection = psycopg.connect( - config.current["database"]["connection_string"], + os.environ.get("connection_string"), autocommit=True, row_factory=psycopg.rows.dict_row )