diff --git a/src/extract_metadata.py b/src/extract_metadata.py index 7dc5448f..faa85c6d 100644 --- a/src/extract_metadata.py +++ b/src/extract_metadata.py @@ -49,9 +49,19 @@ def extract_audio_metadata(file_path): def add_pymupdf_page_metadata(doc: Document, chunk_size: int = 1200, chunk_overlap: int = 600) -> List[Document]: """ - Splits and adds page metadata to each chunk of a pdf document. Relies on the custom implementation of pymupdfparser - Called by document_processor.py. - """ + Called by document_processor.py. Chunks the body of text returned by the custom pymupdfparser script. + Uses a nested helper function named `split_text` to assign the appropriate page metadata to each chunk. + + Detailed Process: + 1. The helper first identifies the positions of the custom page markers within the text using a regular expression. + These markers denote the start of a new page (e.g., `[[page1]]`). + 2. The text is then cleaned by removing the page markers, resulting in a continuous block of text. + 3. The cleaned text is split into chunks based on the specified `chunk_size`. If the chunk size exceeds the + remaining length of the text, the last chunk is adjusted to include the remaining text. + 4. For each chunk, the helper determines the appropriate page number by finding the nearest preceding page + marker position. + 5. The helper returns a list of tuples, each pairing a chunk of text with its associated page number.
+ """ def split_text(text: str, chunk_size: int, chunk_overlap: int) -> List[Tuple[str, int]]: page_markers = [(m.start(), int(m.group(1))) for m in re.finditer(r'\[\[page(\d+)\]\]', text)] clean_text = re.sub(r'\[\[page\d+\]\]', '', text) diff --git a/src/gui_tabs_settings_vision.py b/src/gui_tabs_settings_vision.py index 6758f650..4f078836 100644 --- a/src/gui_tabs_settings_vision.py +++ b/src/gui_tabs_settings_vision.py @@ -79,7 +79,7 @@ def populate_model_combobox(self): else: available_models.append(model) else: - available_models.append(model) # Add non-CUDA models even if CUDA is available + available_models.append(model) else: if not requires_cuda: available_models.append(model) diff --git a/src/module_process_images.py b/src/module_process_images.py index 2da368cc..65378a37 100644 --- a/src/module_process_images.py +++ b/src/module_process_images.py @@ -26,20 +26,6 @@ set_logging_level() -# warnings.filterwarnings("ignore", category=FutureWarning) -# warnings.filterwarnings("ignore", category=UserWarning) -# warnings.filterwarnings("ignore", category=DeprecationWarning) -# warnings.filterwarnings("ignore", message=".*Torch was not compiled with flash attention.*") - -# datasets_logger = logging.getLogger('datasets') -# datasets_logger.setLevel(logging.WARNING) -# logging.getLogger("transformers").setLevel(logging.CRITICAL) -# logging.getLogger("transformers").setLevel(logging.ERROR) -# logging.getLogger("transformers").setLevel(logging.WARNING) -# logging.getLogger("transformers").setLevel(logging.INFO) -# logging.getLogger("transformers").setLevel(logging.DEBUG) -# logging.getLogger().setLevel(logging.WARNING) - ALLOWED_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tif', '.tiff'] current_directory = Path(__file__).parent @@ -202,7 +188,6 @@ def process_single_image(self, raw_image): model_response = full_response.split("ASSISTANT: ")[-1] return model_response - class loader_llava_next(BaseLoader): def initialize_model_and_tokenizer(self): 
chosen_model = self.config['vision']['chosen_model'] @@ -247,7 +232,6 @@ def process_single_image(self, raw_image): return model_response - class loader_falcon(BaseLoader): def initialize_model_and_tokenizer(self): chosen_model = self.config['vision']['chosen_model'] @@ -302,8 +286,6 @@ def process_single_image(self, raw_image): return model_response - - class loader_moondream(BaseLoader): def initialize_model_and_tokenizer(self): chosen_model = self.config['vision']['chosen_model'] @@ -329,7 +311,6 @@ def process_single_image(self, raw_image): summary = self.model.answer_question(enc_image, "Describe what this image depicts in as much detail as possible.", self.tokenizer) return summary - class loader_florence2(BaseLoader): def __init__(self, config): super().__init__(config) @@ -392,7 +373,6 @@ def process_single_image(self, raw_image): return parsed_answer[''] - class loader_phi3vision(BaseLoader): def initialize_model_and_tokenizer(self): chosen_model = self.config['vision']['chosen_model'] @@ -465,7 +445,6 @@ def process_single_image(self, raw_image): return response - class loader_minicpm_V_2_6(BaseLoader): def initialize_model_and_tokenizer(self): chosen_model = self.config['vision']['chosen_model'] diff --git a/src/module_transcribe.py b/src/module_transcribe.py index 209a25f4..925023af 100644 --- a/src/module_transcribe.py +++ b/src/module_transcribe.py @@ -112,23 +112,22 @@ def convert_to_wav(self, audio_file): output_file = f"{Path(audio_file).stem}_converted.wav" output_path = Path(__file__).parent / output_file - with av.open(audio_file) as container: - stream = next(s for s in container.streams if s.type == 'audio') - - resampler = av.AudioResampler( - format='s16', - layout='mono', - rate=16000, - ) + with av.open(audio_file) as input_container: + input_stream = input_container.streams.audio[0] output_container = av.open(str(output_path), mode='w') output_stream = output_container.add_stream('pcm_s16le', rate=16000) - output_stream.layout = 'mono' + 
output_stream.channels = 1 + + resampler = av.AudioResampler(format='s16', layout='mono', rate=16000) + + # Determine optimal chunk size (adjust as needed) + chunk_size = 1024 * 32 # 32KB chunks - for frame in container.decode(audio=0): + for frame in input_container.decode(audio=0): frame.pts = None resampled_frames = resampler.resample(frame) - if resampled_frames is not None: + if resampled_frames: for resampled_frame in resampled_frames: for packet in output_stream.encode(resampled_frame): output_container.mux(packet) diff --git a/src/setup_windows.py b/src/setup_windows.py index 0d650313..db9e1cef 100644 --- a/src/setup_windows.py +++ b/src/setup_windows.py @@ -253,7 +253,6 @@ def install_libraries(libraries): "protobuf==5.27.2", "psutil==6.0.0", "pyarrow==17.0.0", - "pyarrow-hotfix==0.6", "pycparser==2.22", "pydantic==2.7.4", "pydantic_core==2.18.4", @@ -317,10 +316,8 @@ def install_libraries(libraries): "zipp==3.19.2" ] -# matplotlib==3.9.2 -# pyparsing==3.1.2 -# cycler==0.12.1 -# kiwisolver==1.4.5 +# pip install matplotlib==3.9.2 pyparsing==3.1.2 cycler==0.12.1 kiwisolver==1.4.5 --no-deps +# matplotlib will still show conflicts re missing libraries, but these are not needed to run my specific plots full_install_libraries = [ "pyside6==6.7.2",