MEITREX · pkunz96 · Aug 15, 2025 · Jul 18, 2025 · Jul 18, 2025 · Jul 18, 2025
diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml
@@ -3,7 +3,7 @@ name: Building Docker Image
 on:
   push:
     branches:
-      - 'temp-deployment'
+      - 'remove_old'
 
 jobs:
   docker-build:
@@ -22,7 +22,7 @@ jobs:
       - name: Build docker container
         uses: docker/build-push-action@v5
         with:
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
+        #  cache-from: type=gha
+        #  cache-to: type=gha,mode=max
           tags: ghcr.io/meitrex/docprocai_service:latest
           push: true
diff --git a/Dockerfile b/Dockerfile
@@ -15,11 +15,11 @@ COPY requirements.txt .
 # install dependencies using pip
 RUN pip install --no-cache-dir -r requirements.txt
 
-# Install torch manually, it needs a special version which supports our CUDA version. Install this AFTER the
-# requirements, as some of the requirements might install torch as well, but we want to install the correct version
-RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
-
 # copy the current directory contents into the container at /app
 COPY . .
 
-CMD ["python", "./app.py"]
+# The media_service_url environment variable must be set to connect this service with the media service.
+ENV media_service_url="http://app-media:3001/graphql"
+ENV connection_string="user=root password=root host=database port=5432 dbname=docprocai_service"
+
+CMD ["python", "./app.py"]
diff --git a/README.md b/README.md
@@ -12,36 +12,6 @@ This service is designed to process and manage uploaded lecture material (video
 
 For a deeper dive into the features and considerations made during development, check out our paper on *DocProcAI*.
 
-## Installation
-### Neural Network Models Installation
-This service requires neural network models to function at all. These models need to be downloaded and placed into a `llm_data` folder in the root. This folder is then mounted in the docker container
-automatically and the files inside can then be referenced as seen in the `config.yaml`
-
-> [!CAUTION]
-> The service cannot run without at least a sentence embedding model installed!
-
-> [!TIP]
-> The `segment_title_generator` and `document_summary_generator` tasks only require LLMs if these features are enabled in the `config.yaml`. They are enabled by default.
-
-### Recommended Neural Network Models
-
-* For the text embedding, we recommend [Alibaba-NLP/gte-large-en-v1.5](https://huggingface.co/Alibaba-NLP/gte-large-en-v1.5)
-* For the title and summary generation, we recommend [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)
-* While the title generation should work with just a base model, we recommend our custom fine-tuned LoRA Adapter for better results in the title generation task. The adapter files may be provided to you upon request.
-
-### GPU Acceleration
-This service requires pytorch to function. As pytorch GPU-support is required for some features of this service, the pip-distributed version of pytorch cannot be used and instead a
-platform-specific version has to be used.
-By default, pytorch for NVIDIA CUDA 12.4 is used, as this should provide the most capability for widespread GPUs. If you need to use a different version of pytorch, you can change
-the install script located in the `Dockerfile`.
-
-> [!WARNING]
-> Note that GPU features require a supported GPU and OS to function, especially in conjunction with Docker, as the service runs in a Docker container.
-> 
-> Docker does not provide GPU-support for MacOS at this point in time, thus GPU-features of the service do not function on MacOS.
->
->  GPU features can be disabled using the `config.yaml`. Additionally, it might be necessary to change the `docker-compose.yaml` file and remove the GPU device reservation.
-
 ## Configuration
 The service uses the `config.yaml` file located in the root directory for configuration.
 For further information about configuration check out this file, all configuration properties are explained using in-file comments.

diff --git a/client/MediaServiceClient.py b/client/MediaServiceClient.py
@@ -2,9 +2,11 @@
 import config
 import gql
 import gql.transport.aiohttp
+import os
 
 
 class MediaServiceClient:
+
     async def get_media_record_type_and_download_url(self, document_id: uuid.UUID) -> dict:
         self.__init_client_if_not_already()
 
@@ -46,5 +48,5 @@ async def get_media_record_ids_of_contents(self, content_ids: list[uuid.UUID]) -
         return media_records
 
     def __init_client_if_not_already(self):
-        transport = gql.transport.aiohttp.AIOHTTPTransport(url=config.current["media_service_url"])
+        transport = gql.transport.aiohttp.AIOHTTPTransport(url=os.environ.get("media_service_url"))
         self.__client = gql.Client(transport=transport, fetch_schema_from_transport=True)
diff --git a/config.yaml b/config.yaml
@@ -1,12 +1,16 @@
 # URL to the graphql endpoint of the media service. This is used to fetch information about media records the service
 # needs to process
-media_service_url: "http://app-media:3001/graphql"
 
+# This option is disabled because the URL is now read from the media_service_url environment variable. It will be removed in a future version.
+#media_service_url: "http://app-media:3001/graphql"
+
+
+# The database options below are disabled as the connection string is currently fetched from an environment variable. They will be removed in a future version.
 # Settings pertaining to the database connection of the service. Note that the service expects a postgresql database
 # with the pgvector extension installed
-database:
+#database:
   # Connection string to the database
-  connection_string: "user=root password=root host=database port=5432 dbname=docprocai_service"
+  #connection_string: "user=root password=root host=database port=5432 dbname=docprocai_service"
 
 # Settings pertaining to the transcript generation for videos
 transcript_generation:
@@ -31,7 +35,8 @@ text_embedding:
   protocol: "http"
   hostname: "129.69.217.248"
   port: 11435
-
+  dimensionality: 1024
+  model_path: "/"
 
 # Settings pertaining to the content linking step during the content processing pipeline. Content linking is the process
 # of analyzing all documents and videos of a content object and checking which parts of the documents should be linked,
@@ -130,4 +135,4 @@ lecture_llm_generator:
 
       I have some text extracted from the document of a lecture. Please create a very compact overview/table of contents about which topics are covered in this lecture. Your response must be under 800 characters in length.<|eot_id|><|start_header_id|>assistant<|end_header_id|>
 
-      Table of contents of the covered topics:
+      Table of contents of the covered topics:
diff --git a/controller/events.py b/controller/events.py
@@ -1,4 +1,4 @@
-import uuid
+from uuid import UUID
 from dataclasses import dataclass
 from enum import Enum, auto
 
@@ -10,5 +10,5 @@ class CrudOperation(Enum):
 
 @dataclass
 class ContentChangeEvent:
-    contentIds: list[uuid]
+    contentIds: list[UUID]
     crudOperation: CrudOperation
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -27,13 +27,6 @@ services:
       - "9901:9901"
     depends_on:
       - database
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              count: all
-              capabilities: [ gpu ]
   dapr-docprocai:
     image: "daprio/daprd"
     command: [
@@ -58,4 +51,4 @@ volumes:
 networks:
   default:
       name: dapr-network
-      external: true
+      external: true
diff --git a/docker-compose.yml.backup b/docker-compose.yml.backup
@@ -0,0 +1,61 @@
+services:
+  database:
+    image: pgvector/pgvector:pg16
+    command: -c 'max_connections=500'
+    restart: unless-stopped
+    expose:
+      - 5432
+    ports:
+      - "5432:5432"
+    volumes:
+      - dbdata:/var/lib/postgresql/data
+      - ./../docprocai_service/pg-init-scripts:/docker-entrypoint-initdb.d
+    environment:
+      - POSTGRES_USER=root
+      - POSTGRES_PASSWORD=root
+      - POSTGRES_CREATE_DB_DOCPROCAI_SERVICE=docprocai_service
+  app-docprocai:
+    build:
+      context: ./../docprocai_service
+      dockerfile: Dockerfile
+    restart: unless-stopped
+    container_name: docprocai_service
+    volumes:
+      - "./../docprocai_service/llm_data:/app/llm_data"
+    ports:
+      - "9900:9900"
+      - "9901:9901"
+    depends_on:
+      - database
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [ gpu ]
+  dapr-docprocai:
+    image: "daprio/daprd"
+    command: [
+      "./daprd",
+      "--app-id", "docprocai_service",
+      "--app-port", "9901",
+      "--dapr-http-port", "9900",
+      "--resources-path", "./components"
+    ]
+    volumes:
+      - "./../docprocai_service/components/:/components" # Mount our components folder for the runtime to use. The mounted location must match the --resources-path argument.
+    depends_on:
+      - app-docprocai
+      - redis
+    network_mode: "service:app-docprocai"
+  redis:
+    image: "redis:alpine"
+    expose:
+      - "6379"
+volumes:
+  dbdata:
+networks:
+  default:
+      name: dapr-network
+      external: true
diff --git a/fileextractlib/DocumentData.py b/fileextractlib/DocumentData.py
@@ -1,10 +1,10 @@
 from PIL.Image import Image
-from torch import Tensor
 from typing import Optional
-
+from numpy.typing import NDArray
+import numpy as np
 
 class PageData:
-    def __init__(self, page_number: int, text: str, thumbnail: Image, embedding: Optional[Tensor]):
+    def __init__(self, page_number: int, text: str, thumbnail: Image, embedding: Optional[NDArray[np.float64]]):
         self.page_number = page_number
         self.text = text
         self.thumbnail = thumbnail

diff --git a/fileextractlib/LectureLlmGenerator.py b/fileextractlib/LectureLlmGenerator.py
@@ -2,38 +2,21 @@
 import json
 from collections import OrderedDict
 import time
-from typing import Optional
+from typing import Optional, Any
 
 import pydantic
-import torch.cuda
 import config
 from fileextractlib.DocumentData import DocumentData
-from fileextractlib.LLMService import LlamaRunner, LLMProfile, SEGMENT_TITLE_GENERATOR_PROFILE, Hyperparameter
+from fileextractlib.LLMService import LLMProfile, SEGMENT_TITLE_GENERATOR_PROFILE, Hyperparameter
 from fileextractlib.VideoData import VideoData
 import logging
 
-from LLMService import DefaultLLMService, DOCUMENT_SUMMARY_GENERATOR_PROFILE
+
+from fileextractlib.LLMService import DefaultLLMService, DOCUMENT_SUMMARY_GENERATOR_PROFILE
 
 _logger = logging.getLogger(__name__)
 
 class LectureLlmGenerator:
-    def __init__(self):
-        if config.current["lecture_llm_generator"]["keep_models_loaded"]:
-            # if the config says that both llm generators shall use the same base & lora models, then we don't need to
-            # load both but can instead just use the same runner for both tasks
-            if ((config.current["lecture_llm_generator"]["segment_title_generator"]["base_model_path"] ==
-                config.current["lecture_llm_generator"]["document_summary_generator"]["base_model_path"]) and
-                (config.current["lecture_llm_generator"]["segment_title_generator"]["lora_model_path"] ==
-                 config.current["lecture_llm_generator"]["document_summary_generator"]["lora_model_path"])):
-                # our unified runner for both tasks, just use the title runner, both are identical anyway
-                unified_runner = LectureLlmGenerator.__load_title_llama_runner()
-                self.__summarization_llama_runner = unified_runner
-                self.__title_llama_runner = unified_runner
-            else:
-                # otherwise, we'll have to load them separately
-                self.__summarization_llama_runner = LectureLlmGenerator.__load_summarization_llama_runner()
-                self.__title_llama_runner = LectureLlmGenerator.__load_title_llama_runner()
-
     def generate_titles_for_video(self, video_data: VideoData) -> None:
         """
         Uses an LLM to generate appropriate titles for the segments of the passed videos. Modifies the title field in
@@ -144,7 +127,7 @@ def generate_summary_for_document(self, document_data: DocumentData) -> None:
         document_data.summary = [answer_text]
 
     @staticmethod
-    def __generate_answer_json(llm_service: DefaultLLMService, prompt, answer_schema: dict[str, any], profile: Optional[LLMProfile], hyperparameter: Optional[Hyperparameter]) -> any:
+    def __generate_answer_json(llm_service: DefaultLLMService, prompt, answer_schema: dict[str, Any], profile: Optional[LLMProfile], hyperparameter: Optional[Hyperparameter]) -> Any:
 
         generated_text = llm_service.run_custom(prompt, json.dumps(answer_schema), profile, hyperparameter)
 
@@ -156,16 +139,4 @@ def __generate_answer_json(llm_service: DefaultLLMService, prompt, answer_schema
             answer_json = json.loads(generated_text)
             return answer_json
         except ValueError as e:
-            _logger.exception("Error while parsing LLM answer json.", exc_info=e)
-
-    @staticmethod
-    def __load_summarization_llama_runner() -> LlamaRunner:
-        return LlamaRunner(
-            config.current["lecture_llm_generator"]["document_summary_generator"]["base_model_path"],
-            config.current["lecture_llm_generator"]["document_summary_generator"]["lora_model_path"])
-
-    @staticmethod
-    def __load_title_llama_runner() -> LlamaRunner:
-        return LlamaRunner(
-            config.current["lecture_llm_generator"]["segment_title_generator"]["base_model_path"],
-            config.current["lecture_llm_generator"]["segment_title_generator"]["lora_model_path"])
+            _logger.exception("Error while parsing LLM answer json.", exc_info=e)
diff --git a/fileextractlib/LectureVideoEmbeddingGenerator.py b/fileextractlib/LectureVideoEmbeddingGenerator.py
@@ -2,7 +2,6 @@
 
 import time
 from fileextractlib.SentenceEmbeddingRunner import SentenceEmbeddingRunner
-from fileextractlib.VideoProcessor import VideoProcessor
 from fileextractlib.VideoData import VideoSegmentData
 
 class LectureVideoEmbeddingGenerator:

diff --git a/fileextractlib/SentenceEmbeddingRunner.py b/fileextractlib/SentenceEmbeddingRunner.py
@@ -1,10 +1,9 @@
 from typing import List
 
-import yaml
-import urllib
+import urllib.parse
 import requests
 import numpy as np
-from numpy._typing import NDArray
+from numpy.typing import NDArray
 import config
 
 
@@ -20,8 +19,7 @@ def _create_url(self, words: List[str]) -> str:
         query_string = urllib.parse.urlencode(query_params)
         return f"{self.protocol}://{self.hostname}:{self.port}/embed?{query_string}"
 
-    def generate_embeddings(self, words: List[str]) -> NDArray[float]:
-
+    def generate_embeddings(self, words: List[str]) -> NDArray[np.float64]:
         """
         This method accepts a list of strings and computes for each its respective embedding vector.
         :param words: a list of words for which the respective embeddings shall be computed.

diff --git a/fileextractlib/VideoData.py b/fileextractlib/VideoData.py
@@ -1,7 +1,8 @@
 import PIL.Image
-from torch import Tensor
 from webvtt import WebVTT
 from typing import Optional
+from numpy.typing import NDArray
+import numpy as np
 
 class VideoSegmentData:
     """
@@ -15,21 +16,21 @@ def __init__(self,
                  screen_text: str,
                  thumbnail: PIL.Image.Image,
                  title: Optional[str],
-                 embedding: Optional[Tensor]):
+                 embedding: Optional[NDArray[np.float64]]):
         self.start_time: int = start_time
         self.transcript: str = transcript
         self.screen_text: str = screen_text
         self.thumbnail: PIL.Image.Image = thumbnail
         self.title = title
-        self.embedding: Tensor = embedding
+        self.embedding: Optional[NDArray[np.float64]] = embedding
 
 
 class VideoData:
     """
     Represents a video's data, containing the captions and the sections of the video.
     """
 
-    def __init__(self, vtt: WebVTT, segments: list[VideoSegmentData], summary: list[str] = None):
+    def __init__(self, vtt: WebVTT, segments: list[VideoSegmentData], summary: Optional[list[str]] = None):
         if summary is None:
             summary = []
         self.vtt: WebVTT = vtt