# Source code for pyexamgenerator.question_generator

# pyexamgenerator: A tool for generating exams from PDF files using AI.
# Copyright (C) 2024 Daniel Sánchez-García

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import ast
import json
import os
import re
import time
import traceback
from typing import List, Dict, Optional

import google.generativeai as genai
import pandas as pd
import PyPDF2
from docx import Document
from docx.enum.style import WD_STYLE_TYPE
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches
from docx.shared import Pt

class QuotaExceededError(Exception):
    """Raised when the Gemini API request quota has been exceeded."""
class QuestionGenerator:
    """Generate multiple-choice questions from PDF files using the Gemini model.

    Typical use is a single call to ``generate_multiple_choice_questions``,
    which extracts the PDF text, prompts the model chunk by chunk, filters
    the answers against an optional existing question bank and writes the
    results to Excel/DOCX files.
    """

    # Pause (in seconds) after each successful API call. It was a hard-coded
    # ``time.sleep(1)``; kept at 1 second by default.
    REQUEST_PAUSE_SECONDS = 1

    def __init__(self, api_key: str, model_name: str):
        """Configure the Gemini client and set the default generation options.

        Args:
            api_key (str): The API key for the Gemini model.
            model_name (str): The name of the Gemini model to use
                (e.g., 'models/gemini-1.5-pro'). This is a required argument.
        """
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel(model_name)
        print(f"Modelo de Gemini inicializado: {model_name}")

        # Predefined prompt prefixes, selected through ``prompt_type``.
        self.prompt_types = {
            "PRL": """
            Genera preguntas tipo test de opción múltiple enfocadas en Prevención de Riesgos Laborales (PRL), basadas en el siguiente texto, siguiendo estas instrucciones iniciales:

            1. **Centrarse en leyes y reglamentos:** Las preguntas deben poder entenderse por sí mismas, haciendo referencia únicamente a leyes o reglamentos relevantes. No se puede poner: "Según el texto"
            2. **Evita preguntas sobre números de artículos o códigos:** No incluyas preguntas que hagan referencia a números de artículos, códigos de leyes o reglamentos.

            Además, ten en cuenta las siguientes indicaciones adicionales
            """,
            "PM": """
            Genera preguntas tipo test de opción múltiple enfocadas en Gestión de Proyectos.
            """
        }
        self.existing_bank_df = None
        self.similarity_threshold = 0.8  # Default threshold
        self.current_prompt_example_content_type = "solo_enunciados"  # Default value

        # State that used to be created lazily by other methods. Defining it
        # here lets the helper methods be called on a fresh instance without
        # raising AttributeError.
        self.rejected_questions_list = []
        self.final_df = pd.DataFrame()
        self.text = ""
        self.page_texts = []
        self.num_pages = 0

    def extract_pdf_text(self, pdf_path: str) -> Optional[str]:
        """Extract the text of a PDF file and cache it on the instance.

        On success ``self.text``, ``self.page_texts`` and ``self.num_pages``
        describe the PDF that was just read. On failure they are cleared so
        that data from a previously processed PDF is never reused.

        Args:
            pdf_path (str): The path to the PDF file.

        Returns:
            Optional[str]: The extracted text, or None if an error occurs.
        """
        try:
            with open(pdf_path, 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                # A page without extractable text may yield None or "";
                # normalise to "" so concatenation cannot fail.
                page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
        except Exception as e:
            # Best-effort boundary: report the problem and let the caller
            # skip this PDF.
            print(f"Error extracting text from PDF: {e}")
            self.text = ""
            self.page_texts = []
            self.num_pages = 0
            return None

        self.page_texts = page_texts  # Text of each page, in order
        self.text = "".join(page_texts)
        self.num_pages = len(page_texts)  # Total number of pages
        return self.text

    def extract_topic_number_from_path(self, pdf_path: str) -> str:
        """Derive the topic name from the PDF filename.

        The topic is the file name without its ``.pdf`` extension. The
        extension is matched case-insensitively.

        Args:
            pdf_path (str): The path to the PDF file.

        Returns:
            str: The extracted topic name (also stored in ``self.topic_number``).
        """
        file_name = os.path.basename(pdf_path)
        stem, extension = os.path.splitext(file_name)
        if extension.lower() == '.pdf':
            self.topic_number = stem
        else:
            # Not a ``.pdf`` suffix: keep the historical behaviour.
            self.topic_number = file_name.split('.pdf')[0]
        return self.topic_number

    def _group_pages_into_chunks(self, pages_per_chunk: int) -> List[str]:
        """Group the cached page texts into larger chunks.

        Args:
            pages_per_chunk (int): The number of pages to include in each chunk.

        Returns:
            List[str]: One string per chunk, pages joined with a newline.
                Empty when no page text has been extracted.

        Raises:
            ValueError: If ``pages_per_chunk`` is smaller than 1.
        """
        page_texts = getattr(self, 'page_texts', None)
        if not page_texts:
            return []
        if pages_per_chunk < 1:
            raise ValueError(
                f"pages_per_chunk must be a positive integer, got {pages_per_chunk}"
            )
        return [
            "\n".join(text or "" for text in page_texts[start:start + pages_per_chunk])
            for start in range(0, len(page_texts), pages_per_chunk)
        ]

    def _build_bank_guidance(
            self,
            existing_questions_df: Optional[pd.DataFrame],
            current_topic: str,
            bank_prompt_scope: Optional[str]
    ) -> str:
        """Build the prompt section listing questions of the existing bank."""
        if existing_questions_df is None or existing_questions_df.empty:
            return ""
        if not bank_prompt_scope or bank_prompt_scope == "no_incluir_en_prompt":
            return ""

        if bank_prompt_scope == "mismo_tema":
            if 'Tema' not in existing_questions_df.columns:
                return ""
            same_topic = existing_questions_df['Tema'].astype(str).str.lower() == current_topic.lower()
            guidance_df = existing_questions_df[same_topic]
            header = "\n\nPara evitar redundancia con el BANCO DE PREGUNTAS EXISTENTE, considera las siguientes preguntas del MISMO TEMA (asegúrate de que las nuevas preguntas sean originales y basadas estrictamente en el FRAGMENTO DE TRABAJO proporcionado):\n"
        elif bank_prompt_scope == "todos_los_temas":
            guidance_df = existing_questions_df
            header = "\n\nPara evitar redundancia con el BANCO DE PREGUNTAS EXISTENTE, considera las siguientes preguntas de TODO EL BANCO (asegúrate de que las nuevas preguntas sean originales y basadas estrictamente en el TEXTO COMPLETO proporcionado):\n"
        else:
            # Unknown scope: nothing is added to the prompt.
            return ""

        if guidance_df.empty:
            return ""

        include_options = self.current_prompt_example_content_type == "enunciados_y_respuestas"
        parts = [header]
        for _, row in guidance_df.iterrows():
            question_text = row.get('Pregunta', '')
            # Skip rows without a statement (NaN/empty cells) so the prompt
            # never contains a literal "nan" entry.
            if pd.isna(question_text) or question_text == '':
                continue
            parts.append(f"- Pregunta del banco: {question_text}\n")
            if include_options:
                candidate_options = [row.get(f'Respuesta {letter}', '') for letter in "ABCD"]
                options_str = "\n".join(
                    f" - {opt}" for opt in candidate_options if pd.notna(opt) and opt
                )
                if options_str:
                    parts.append(f"{options_str}\n")
        parts.append("\n")
        return "".join(parts)

    def _build_attempts_guidance(
            self,
            questions_from_current_chunk_attempts: Optional[List[Dict]]
    ) -> str:
        """Build the prompt section listing questions already accepted for this chunk."""
        if not questions_from_current_chunk_attempts:
            return ""
        parts = [
            "\n\nAdicionalmente, para evitar generar preguntas idénticas o muy similares a las YA GENERADAS PARA ESTE MISMO FRAGMENTO en intentos previos, ten en cuenta las siguientes preguntas que ya se han aceptado:\n"
        ]
        for question_dict in questions_from_current_chunk_attempts:
            question_text = question_dict.get('Pregunta', '')
            if question_text:
                parts.append(f"- Pregunta ya generada para este fragmento: {question_text}\n")
        parts.append("\n")
        return "".join(parts)

    def _build_prompt(
            self,
            working_chunk_text: str,
            num_questions_to_generate: int,
            prompt_type: str,
            custom_prompt: Optional[str],
            existing_questions_df: Optional[pd.DataFrame],
            current_topic: str,
            full_document_context_text: Optional[str] = None,
            bank_prompt_scope: Optional[str] = None,
            questions_from_current_chunk_attempts: Optional[List[Dict]] = None
    ) -> str:
        """Construct the complete prompt to be sent to the Gemini model.

        Args:
            working_chunk_text (str): The specific text segment to generate questions from.
            num_questions_to_generate (int): The target number of questions to generate.
            prompt_type (str): The key for the predefined prompt type (e.g., "PRL").
            custom_prompt (Optional[str]): A user-provided custom prompt; when
                given it replaces the predefined prefix.
            existing_questions_df (Optional[pd.DataFrame]): DataFrame of existing questions for guidance.
            current_topic (str): The topic of the current PDF.
            full_document_context_text (Optional[str]): The full text of the document for context.
            bank_prompt_scope (Optional[str]): Scope for selecting guidance questions
                ('mismo_tema', 'todos_los_temas'); any other value adds no guidance.
            questions_from_current_chunk_attempts (Optional[List[Dict]]): Questions
                already generated for this chunk.

        Returns:
            str: The fully constructed prompt string.
        """
        if custom_prompt:
            user_prefix = custom_prompt
        elif prompt_type in self.prompt_types:
            user_prefix = self.prompt_types[prompt_type]
        else:
            user_prefix = """
            Genera preguntas tipo test de opción múltiple basadas en el siguiente texto, siguiendo estas instrucciones:
            """

        fixed_prefix = f"""
        Genera {num_questions_to_generate} preguntas tipo test de opción múltiple con una única respuesta correcta basadas en el texto que adjunto al final, siguiendo estas instrucciones:
        """

        # Guidance from the existing question bank and from the questions
        # already accepted for this chunk in previous attempts.
        prompt_guidance_from_bank_str = self._build_bank_guidance(
            existing_questions_df, current_topic, bank_prompt_scope
        )
        prompt_guidance_from_current_attempts_str = self._build_attempts_guidance(
            questions_from_current_chunk_attempts
        )

        context_section = ""
        if full_document_context_text is not None:
            context_section = f"""
        Aquí está el **CONTEXTO COMPLETO** del documento, que te proporcionamos para que tengas una comprensión general del tema y el documento en su totalidad. Puedes referenciar este texto para obtener contexto adicional, pero las preguntas deben basarse estrictamente en el **FRAGMENTO DE TRABAJO**.

        {full_document_context_text}
        """
            chunk_instruction_and_text_section = f"""
        ---
        **IMPORTANTE**: Debes generar las preguntas **ÚNICAMENTE** basándote en el siguiente "**FRAGMENTO DE TRABAJO**". La respuesta a cada pregunta debe ser directamente derivable del contenido de este FRAGMENTO DE TRABAJO.
        ---
        Aquí está el FRAGMENTO DE TRABAJO sobre el que debes generar las preguntas:

        {working_chunk_text}
        """
        else:
            chunk_instruction_and_text_section = f"""
        Aquí está el **TEXTO COMPLETO** sobre el que debes generar las preguntas:

        {working_chunk_text}
        """

        prompt = f"""
        {user_prefix}

        {fixed_prefix}

        1. **Incluye la respuesta correcta:** Para cada pregunta, indica cuál de las opciones es la respuesta correcta (solo la letra, sin paréntesis).
        2. **Proporciona el texto relevante:** Incluye un fragmento del texto del PDF donde se encuentra la justificación de la respuesta correcta.
        3. **Formato de diccionario:** Presenta las preguntas y respuestas en formato de diccionario de Python, donde cada diccionario representa una pregunta y tiene las siguientes claves:
            * "Pregunta"
            * "Opciones" (una lista de las opciones A, B, C, D, sin la letra inicial y el paréntesis)
            * "Respuesta correcta" (la letra de la opción correcta, sin paréntesis)
            * "Texto relevante" (el texto relevante del PDF)

        Separa cada pregunta con una línea en blanco.

        Puedes incluir preguntas del tipo: señala la afirmación incorrecta entre las siguientes opciones; o señala la opción correcta entre las siguientes. Además, puedes incluir respuestas como todas son correctas, o todas son incorrectas.
        {prompt_guidance_from_bank_str}
        {prompt_guidance_from_current_attempts_str}
        {context_section}
        {chunk_instruction_and_text_section}
        """
        return prompt

    def _analyze_gemini_response(self, response_text: str, topic_number: str) -> List[Dict]:
        """Parse the model's response and split it into accepted/rejected questions.

        Each blank-line-separated block of the response is searched for a
        Python-dictionary literal. Questions whose statement is too similar to
        one in the existing bank are appended to
        ``self.rejected_questions_list``; the rest are returned.

        Args:
            response_text (str): The raw text response from the Gemini model.
            topic_number (str): The topic associated with these questions.

        Returns:
            List[Dict]: A list of accepted question dictionaries.
        """
        accepted_questions = []

        # The model is asked to separate question dictionaries with a blank line.
        for question_str in response_text.split('\n\n'):
            # Find a Python dictionary-like structure within the block.
            match = re.search(r'\{(.*?)\}', question_str.strip(), re.DOTALL)
            if not match:
                continue
            json_like_str = match.group(0)

            try:
                # SECURITY: the text comes from the model (and indirectly from
                # the PDF), so it is untrusted. ``ast.literal_eval`` only
                # accepts literals; the previous ``eval`` could execute
                # arbitrary code embedded in a response.
                question_data = ast.literal_eval(json_like_str)
            except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
                print(f"Error al evaluar la estructura como diccionario: {json_like_str}")
                continue

            if not isinstance(question_data, dict) or "Pregunta" not in question_data:
                continue

            # Check for similarity against the existing bank.
            is_similar, sim_score, matched_text = self._is_similar_question(question_data["Pregunta"])

            # Pad/trim the options to exactly four entries so that a short or
            # malformed list no longer aborts the whole response.
            raw_options = question_data.get("Opciones")
            options = list(raw_options) if isinstance(raw_options, (list, tuple)) else []
            options = (options + [""] * 4)[:4]

            raw_answer = question_data.get("Respuesta correcta", "")
            correct_answer = "" if raw_answer is None else str(raw_answer).strip().upper()

            # Build the base dictionary for the generated question.
            question_dict = {
                "Tema": f"{topic_number}",
                "Estado": "Pendiente de revisar",
                "Pregunta": question_data.get("Pregunta", ""),
                "Respuesta A": options[0],
                "Respuesta B": options[1],
                "Respuesta C": options[2],
                "Respuesta D": options[3],
                "Respuesta correcta": correct_answer,
                "Texto relevante": question_data.get("Texto relevante", "")
            }

            if is_similar:
                # Keep the rejected question with extra details for reporting.
                question_dict['Similitud'] = sim_score
                question_dict['Pregunta Coincidente'] = matched_text
                self.rejected_questions_list.append(question_dict)
            else:
                accepted_questions.append(question_dict)

        return accepted_questions

    def save_questions_to_excel(
            self,
            df: pd.DataFrame,
            filepath: str
    ) -> None:
        """Save the questions to an Excel file.

        NOTE: when the DataFrame has no 'Número de pregunta' column, one is
        inserted **in place**, so the caller's DataFrame is modified.

        Args:
            df (pd.DataFrame): The questions to save.
            filepath (str): The filepath for the output file.
        """
        if df.empty:
            print("No se generaron preguntas aceptadas para guardar en Excel.")
            return
        if 'Número de pregunta' not in df.columns:
            df.insert(0, 'Número de pregunta', range(1, len(df) + 1))
        df.to_excel(filepath, index=False)
        print(f"Preguntas aceptadas guardadas en: '{filepath}'")

    def save_questions_to_docx(
            self,
            df: pd.DataFrame,
            filepath: str
    ) -> None:
        """Save the questions to a DOCX file for manual review.

        Args:
            df (pd.DataFrame): The questions to save. Questions are numbered
                as ``index + 1``, so an integer index is expected.
            filepath (str): The filepath for the output file.
        """
        if df.empty:
            print("No hay preguntas para guardar en DOCX.")
            return

        document = Document()
        document.add_heading('Banco de Preguntas - Pendiente de Revisar', level=1)

        for index, row in df.iterrows():
            pregunta_paragraph = document.add_paragraph()
            pregunta_run = pregunta_paragraph.add_run(f"Pregunta {index + 1}: {row['Pregunta']}")
            pregunta_run.bold = True

            document.add_paragraph(f"Tema: {row['Tema']}")
            document.add_paragraph(f"Estado: {row['Estado']}")
            document.add_paragraph(f"A) {row['Respuesta A']}")
            document.add_paragraph(f"B) {row['Respuesta B']}")
            document.add_paragraph(f"C) {row['Respuesta C']}")
            document.add_paragraph(f"D) {row['Respuesta D']}")
            document.add_paragraph(f"Respuesta correcta: {row['Respuesta correcta']}")
            document.add_paragraph(f"Texto relevante: {row['Texto relevante']}")
            document.add_paragraph()  # Blank paragraph between questions

        document.save(filepath)

    def save_rejected_questions_to_docx(self, df: pd.DataFrame, filepath: str) -> None:
        """Save questions discarded due to similarity to a DOCX file for inspection.

        Args:
            df (pd.DataFrame): The DataFrame containing the discarded questions
                and similarity details.
            filepath (str): The filepath for the output file.
        """
        if df.empty:
            print("No se descartaron preguntas por similitud.")
            return

        document = Document()
        document.add_heading('Preguntas Descartadas por Similitud', level=1)
        document.add_paragraph(
            "Las siguientes preguntas fueron generadas por el modelo, pero se descartaron "
            "automáticamente porque su similitud con una pregunta existente en el banco "
            "superó el umbral establecido."
        )

        for _, row in df.iterrows():
            # Visual separator between questions
            document.add_paragraph("---")

            # Information about the discarded question
            p = document.add_paragraph()
            p.add_run('Pregunta descartada: ').bold = True
            p.add_run(f"{row['Pregunta']}")

            document.add_paragraph(f"Tema: {row['Tema']}")
            document.add_paragraph(f"A) {row['Respuesta A']}")
            document.add_paragraph(f"B) {row['Respuesta B']}")
            document.add_paragraph(f"C) {row['Respuesta C']}")
            document.add_paragraph(f"D) {row['Respuesta D']}")
            document.add_paragraph(f"Respuesta correcta: {row['Respuesta correcta']}")

            # Information about the reason for rejection
            p_reason = document.add_paragraph()
            p_reason.add_run('Motivo del descarte:').bold = True

            # Format the similarity score as a percentage
            similarity_score_pct = f"{row.get('Similitud', 0.0) * 100:.1f}%"

            p_details = document.add_paragraph()
            p_details.add_run("Similitud: ").bold = True
            p_details.add_run(f"{similarity_score_pct}")

            p_match = document.add_paragraph()
            p_match.add_run("Coincide con la pregunta existente: ").bold = True
            p_match.add_run(f"{row.get('Pregunta Coincidente', 'N/A')}")

        document.save(filepath)
        print(f"Se guardó un informe de {len(df)} preguntas descartadas en: '{filepath}'")

    def _normalize_text(self, text: str) -> str:
        """Normalize text for comparison (lowercase, no punctuation).

        Non-string input yields an empty string.
        """
        if not isinstance(text, str):
            return ""
        return re.sub(r'[^\w\s]', '', text.lower())  # Remove punctuation

    @staticmethod
    def _jaccard_from_word_sets(words1: set, words2: set) -> float:
        """Return the Jaccard index of two word sets (1.0 when both are empty)."""
        if not words1 and not words2:
            return 1.0
        if not words1 or not words2:
            return 0.0
        return len(words1 & words2) / len(words1 | words2)

    def _calculate_jaccard_similarity(self, text1: str, text2: str) -> float:
        """Calculate the Jaccard similarity between the word sets of two texts."""
        words1 = set(self._normalize_text(text1).split())
        words2 = set(self._normalize_text(text2).split())
        return self._jaccard_from_word_sets(words1, words2)

    def _is_similar_question(self, new_question_text: str
                             ) -> tuple[bool, float, Optional[str]]:
        """Check whether the new question is similar to one in the existing bank.

        Only the question statements are compared. The first bank question
        whose similarity reaches ``self.similarity_threshold`` is reported.

        Args:
            new_question_text (str): The text of the new question.

        Returns:
            tuple[bool, float, Optional[str]]:
                (is_similar, similarity_score, matching_question_text)
        """
        bank = self.existing_bank_df
        # A threshold > 1.0 (e.g., 1.1) effectively disables the filter.
        if bank is None or bank.empty or self.similarity_threshold > 1.0:
            return (False, 0.0, None)
        if 'Pregunta' not in bank.columns:
            return (False, 0.0, None)

        # Tokenise the new question once instead of once per bank row.
        new_words = set(self._normalize_text(new_question_text).split())

        for existing_question_text in bank['Pregunta']:
            if not existing_question_text or not isinstance(existing_question_text, str):
                continue
            existing_words = set(self._normalize_text(existing_question_text).split())
            question_similarity = self._jaccard_from_word_sets(new_words, existing_words)
            if question_similarity >= self.similarity_threshold:
                return (True, question_similarity, existing_question_text)

        return (False, 0.0, None)

    def _load_existing_bank(self, existing_bank_path: Optional[str]) -> Optional[pd.DataFrame]:
        """Load the existing question bank from Excel, or return None if unavailable."""
        if not existing_bank_path:
            return None
        if not os.path.exists(existing_bank_path):
            print(f"Advertencia: El archivo del banco de preguntas existente no se encontró en '{existing_bank_path}'.")
            return None
        try:
            bank_df = pd.read_excel(existing_bank_path)
        except Exception as e:
            # Best effort: generation continues without the similarity filter.
            print(f"Error al cargar el banco de preguntas existente: {e}")
            return None
        print(f"Banco de preguntas existente cargado desde '{existing_bank_path}' para el filtro de similitud.")
        return bank_df

    def _generate_questions_for_chunk(
            self,
            *,
            chunk_text: str,
            chunk_number: int,
            topic: str,
            num_questions_target: int,
            max_attempts: int,
            prompt_type: str,
            custom_prompt: Optional[str],
            context_text: Optional[str],
            bank_prompt_scope: Optional[str],
            print_raw_gemini_answer: bool
    ) -> List[Dict]:
        """Query the model repeatedly until a chunk reaches its question target.

        Raises:
            QuotaExceededError: If the API reports a 429 quota error.
        """
        accepted_for_chunk = []
        # Normalised statements already accepted for this chunk. This also
        # catches duplicates inside a single response, which the previous
        # pairwise comparison missed.
        seen_statements = set()
        attempt = 0

        while len(accepted_for_chunk) < num_questions_target and attempt < max_attempts:
            attempt += 1
            num_still_needed = num_questions_target - len(accepted_for_chunk)
            print(f" Intento {attempt}/{max_attempts} para el fragmento: Se necesitan {num_still_needed} preguntas más.")

            prompt = self._build_prompt(
                working_chunk_text=chunk_text,
                num_questions_to_generate=num_still_needed,
                prompt_type=prompt_type,
                custom_prompt=custom_prompt,
                existing_questions_df=self.existing_bank_df,
                current_topic=topic,
                full_document_context_text=context_text,
                bank_prompt_scope=bank_prompt_scope,
                questions_from_current_chunk_attempts=accepted_for_chunk
            )

            try:
                response = self.model.generate_content(prompt)
                if print_raw_gemini_answer:
                    print(f" Respuesta cruda de Gemini (Fragmento {chunk_number}, Intento {attempt}):\n >>>\n{response.text}\n <<<\n ---")
                time.sleep(self.REQUEST_PAUSE_SECONDS)

                newly_accepted = self._analyze_gemini_response(response.text, topic)

                # Drop statements already accepted for this chunk, in earlier
                # attempts or earlier in this same response.
                unique_newly_accepted = []
                for question in newly_accepted:
                    statement_key = self._normalize_text(question.get("Pregunta", ""))
                    if statement_key in seen_statements:
                        continue
                    seen_statements.add(statement_key)
                    unique_newly_accepted.append(question)

                accepted_for_chunk.extend(unique_newly_accepted)
                print(
                    f" Intento {attempt}: {len(unique_newly_accepted)} preguntas nuevas aceptadas para este fragmento. Total para fragmento: {len(accepted_for_chunk)}/{num_questions_target}")

            except Exception as e_gen:
                error_str = str(e_gen)
                # A quota error aborts the whole run; retrying would only
                # produce more failed requests.
                if "429" in error_str and "quota" in error_str.lower():
                    suggestion = (
                        "\n\nSugerencias para solucionarlo:\n"
                        "1. Espera unos minutos: La cuota gratuita se reinicia cada cierto tiempo.\n"
                        "2. Cambia de modelo: Los modelos 'Pro' son más potentes pero consumen la cuota más rápido. "
                        "Prueba a cambiar a un modelo 'Flash' (como 'gemini-1.5-flash'), que es más rápido y económico."
                    )
                    message = (
                        "Has excedido el límite de solicitudes a la API de Gemini (Error 429).\n"
                        f"{suggestion}"
                    )
                    raise QuotaExceededError(message) from e_gen
                # Any other error is reported for debugging and the loop
                # moves on to the next attempt.
                print(f" Error durante la generación/análisis en el intento {attempt} para el fragmento {chunk_number}: {e_gen}")
                print(traceback.format_exc())

        if len(accepted_for_chunk) < num_questions_target:
            print(
                f" Advertencia: No se alcanzó el objetivo de {num_questions_target} preguntas para el fragmento {chunk_number} después de {max_attempts} intentos. Se obtuvieron {len(accepted_for_chunk)}.")

        return accepted_for_chunk

    def generate_multiple_choice_questions(
            self,
            pdf_paths: List[str],
            prompt_type: str,
            num_questions_per_chunk_target: int = 5,
            output_filename: Optional[str] = None,
            output_dir: Optional[str] = None,
            custom_prompt: Optional[str] = None,
            generate_docx: bool = True,
            existing_bank_path: Optional[str] = None,
            process_by_pages: bool = False,
            pages_per_chunk: int = 1,
            similarity_threshold: Optional[float] = 0.8,
            bank_prompt_scope: Optional[str] = None,
            prompt_example_content_type: str = "solo_enunciados",
            print_raw_gemini_answer: bool = False,
            max_generation_attempts_per_chunk: int = 3
    ) -> pd.DataFrame:
        """Generate multiple-choice questions from several PDFs and save them.

        This method orchestrates the entire generation process:
        - Loading an existing question bank for similarity checks.
        - Iterating through each provided PDF.
        - Splitting PDFs into chunks if required.
        - Making several attempts per chunk to reach the target number of questions.
        - Building prompts, generating content, and analyzing responses.
        - Saving accepted and rejected questions to files.

        Args:
            pdf_paths (List[str]): List of paths to the PDF files.
            prompt_type (str): The key for the predefined prompt type.
            num_questions_per_chunk_target (int): The target number of questions per text chunk.
            output_filename (Optional[str]): The base name for the output files.
                NOTE: '_pendiente_de_revisar' is always appended, also to the default name.
            output_dir (Optional[str]): The directory to save the output files.
                If None, uses the current working directory.
            custom_prompt (Optional[str]): A user-provided custom prompt to override the default.
            generate_docx (bool): Whether to generate a DOCX file for review.
            existing_bank_path (Optional[str]): Path to an existing Excel question bank
                for similarity filtering.
            process_by_pages (bool): If True, process the PDF in chunks of pages.
            pages_per_chunk (int): The number of pages per chunk if processing by pages.
            similarity_threshold (Optional[float]): The threshold for the Jaccard similarity
                filter. None or a value outside [0, 1] disables the filter.
            bank_prompt_scope (Optional[str]): Scope for selecting guidance questions from the bank.
            prompt_example_content_type (str): Content type for guidance questions
                ('solo_enunciados' or 'enunciados_y_respuestas').
            print_raw_gemini_answer (bool): If True, prints the raw API response for debugging.
            max_generation_attempts_per_chunk (int): The maximum number of attempts to reach
                the target per chunk.

        Returns:
            pd.DataFrame: A DataFrame containing all the successfully generated and
                filtered questions.

        Raises:
            QuotaExceededError: If the Gemini API reports that the quota was exceeded.
        """
        if output_filename is None:
            output_filename = 'preguntas_pendiente_de_revisar'
        if output_dir is None:
            output_dir = os.getcwd()
        os.makedirs(output_dir, exist_ok=True)

        self.final_df = pd.DataFrame()
        self.rejected_questions_list = []  # Accumulates all rejected questions

        if similarity_threshold is not None and 0 <= similarity_threshold <= 1.0:
            self.similarity_threshold = similarity_threshold
            print(f"Filtro de similitud activado con umbral: {self.similarity_threshold}")
        else:
            self.similarity_threshold = 1.1  # Effectively disables the filter
            print(f"Filtro de similitud DESACTIVADO (umbral: {self.similarity_threshold}).")

        self.current_prompt_example_content_type = prompt_example_content_type
        print(f"Contenido de ejemplos en prompt: {self.current_prompt_example_content_type}")

        self.existing_bank_df = self._load_existing_bank(existing_bank_path)

        for pdf_path in pdf_paths:
            print(f"\n--- Procesando PDF: {pdf_path} ---")
            pdf_full_text_content = self.extract_pdf_text(pdf_path)
            current_pdf_topic = self.extract_topic_number_from_path(pdf_path)

            if not pdf_full_text_content:
                print(f"No se pudo extraer texto del PDF: {pdf_path}")
                continue

            if process_by_pages and getattr(self, 'page_texts', None):
                text_chunks_to_process = self._group_pages_into_chunks(pages_per_chunk)
                # The full document is only sent as context when the PDF is
                # split; otherwise the single chunk already is the full text.
                context_for_chunks = pdf_full_text_content
                print(f"PDF dividido en {len(text_chunks_to_process)} fragmentos de ~{pages_per_chunk} página(s).")
            else:
                text_chunks_to_process = [pdf_full_text_content]
                context_for_chunks = None
                print("Procesando el PDF completo como un solo fragmento.")

            all_questions_for_this_pdf = []
            total_chunks = len(text_chunks_to_process)
            for chunk_number, chunk_text in enumerate(text_chunks_to_process, start=1):
                print(f"\nProcesando fragmento {chunk_number}/{total_chunks} del PDF...")
                all_questions_for_this_pdf.extend(
                    self._generate_questions_for_chunk(
                        chunk_text=chunk_text,
                        chunk_number=chunk_number,
                        topic=current_pdf_topic,
                        num_questions_target=num_questions_per_chunk_target,
                        max_attempts=max_generation_attempts_per_chunk,
                        prompt_type=prompt_type,
                        custom_prompt=custom_prompt,
                        context_text=context_for_chunks,
                        bank_prompt_scope=bank_prompt_scope,
                        print_raw_gemini_answer=print_raw_gemini_answer
                    )
                )

            if all_questions_for_this_pdf:
                # Concatenate per PDF so ``self.final_df`` holds the partial
                # results if a later PDF aborts the run.
                pdf_df = pd.DataFrame(all_questions_for_this_pdf)
                self.final_df = pd.concat([self.final_df, pdf_df], ignore_index=True)

        excel_output_path = os.path.join(output_dir, f'{output_filename}_pendiente_de_revisar.xlsx')
        docx_output_path = os.path.join(output_dir, f'{output_filename}_pendiente_de_revisar.docx')

        self.save_questions_to_excel(self.final_df, excel_output_path)
        if generate_docx:
            self.save_questions_to_docx(self.final_df, docx_output_path)

        if self.rejected_questions_list:
            rejected_df = pd.DataFrame(self.rejected_questions_list)
            rejected_docx_path = os.path.join(output_dir, f'{output_filename}_similares_descartadas.docx')
            self.save_rejected_questions_to_docx(rejected_df, rejected_docx_path)
        else:
            print("No se descartaron preguntas por similitud durante todo el proceso.")

        print("\n--- Proceso de generación completado. ---")
        print(f"Total preguntas aceptadas y guardadas: {len(self.final_df)}")
        if self.rejected_questions_list:
            print(f"Total preguntas rechazadas por similitud (acumulado de todos los intentos): {len(self.rejected_questions_list)}")

        return self.final_df
# Example of use (uncomment to test)
# if __name__ == '__main__':
#     api_key = "YOUR_GEMINI_API_KEY_HERE"  # Replace with your Gemini API key
#     # model_name is a required argument of QuestionGenerator.
#     generator = QuestionGenerator(api_key, "models/gemini-1.5-flash")
#     pdf_files = ['TEMA 05_Inspecciones de seguridad.pdf', 'TEMA 08_Equipos de proteccion individual.pdf']
#
#     # Example 1: Normal processing (the whole text at once)
#     # questions_df = generator.generate_multiple_choice_questions(
#     #     pdf_files,
#     #     prompt_type="PRL",
#     #     num_questions_per_chunk_target=10,  # Generate 10 questions per PDF
#     #     output_filename="banco_de_preguntas",
#     #     generate_docx=True
#     # )
#
#     # Example 2: Processing by individual pages
#     # questions_df_pages = generator.generate_multiple_choice_questions(
#     #     pdf_files,
#     #     prompt_type="PRL",
#     #     num_questions_per_chunk_target=2,  # Generate 2 questions per page
#     #     output_filename="banco_de_preguntas_por_paginas",
#     #     generate_docx=True,
#     #     process_by_pages=True,  # Activate processing by pages
#     #     pages_per_chunk=1,  # Process each page individually
#     #     existing_bank_path="examen_PIR.xlsx"
#     # )
#
#     # Example 3: Processing by groups of pages
#     questions_df_page_groups = generator.generate_multiple_choice_questions(
#         pdf_files,
#         prompt_type="PRL",
#         num_questions_per_chunk_target=3,  # Generate 3 questions for each group of pages
#         pages_per_chunk=1,  # Number of pages grouped into each chunk
#         output_filename="banco_de_preguntas_grupos_paginas",
#         generate_docx=True,
#         process_by_pages=True,  # Activate processing by pages
#         existing_bank_path=r'D:\path\to\your\existing_bank.xlsx',
#         bank_prompt_scope='todos_los_temas',
#         prompt_example_content_type='solo_enunciados'
#     )
#
#     # You can use the DataFrame to inspect the generated questions