# pyexamgenerator: A tool for generating exams from PDF files using AI.
# Copyright (C) 2024 Daniel Sánchez-García
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import google.generativeai as genai
import PyPDF2
import json
import re
import os
import pandas as pd
import time
from docx import Document
from docx.shared import Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt
from docx.enum.style import WD_STYLE_TYPE
from typing import List, Dict, Optional
[documentos]
class QuotaExceededError(Exception):
"""Excepción para cuando se excede el límite de cuota de la API de Gemini."""
pass
[documentos]
class QuestionGenerator:
"""
Generates multiple-choice questions from PDF files using the Gemini model.
"""
def __init__(self, api_key: str, model_name: str):
"""
Initializes the QuestionGenerator class.
Args:
api_key (str): The API key for the Gemini model.
model_name (str): The name of the Gemini model to use (e.g., 'models/gemini-1.5-pro').
This is a required argument.
"""
genai.configure(api_key=api_key)
self.model = genai.GenerativeModel(model_name)
print(f"Modelo de Gemini inicializado: {model_name}")
self.prompt_types = {
"PRL": """
Genera preguntas tipo test de opción múltiple enfocadas en Prevención de Riesgos Laborales (PRL), basadas en el siguiente texto, siguiendo estas instrucciones iniciales:
1. **Centrarse en leyes y reglamentos:** Las preguntas deben poder entenderse por sí mismas, haciendo referencia únicamente a leyes o reglamentos relevantes. No se puede poner: "Según el texto"
2. **Evita preguntas sobre números de artículos o códigos:** No incluyas preguntas que hagan referencia a números de artículos, códigos de leyes o reglamentos.
Además, ten en cuenta las siguientes indicaciones adicionales
""",
"PM": """
Genera preguntas tipo test de opción múltiple enfocadas en Gestión de Proyectos.
"""
}
self.existing_bank_df = None
self.similarity_threshold = 0.8 # Default threshold
self.current_prompt_example_content_type = "solo_enunciados" # Default value
def _group_pages_into_chunks(self, pages_per_chunk: int) -> List[str]:
"""
Groups the text from pages into larger chunks.
Args:
pages_per_chunk (int): The number of pages to include in each chunk.
Returns:
List[str]: A list of strings, where each string is a text chunk.
"""
if not hasattr(self, 'page_texts') or not self.page_texts:
return []
text_chunks = []
current_chunk_pages = []
for i, page_text in enumerate(self.page_texts):
current_chunk_pages.append(page_text)
if (i + 1) % pages_per_chunk == 0 or (i + 1) == len(self.page_texts):
text_chunks.append("\n".join(current_chunk_pages))
current_chunk_pages = []
return text_chunks
def _build_prompt(
self,
working_chunk_text: str,
num_questions_to_generate: int,
prompt_type: str,
custom_prompt: Optional[str],
existing_questions_df: Optional[pd.DataFrame],
current_topic: str,
full_document_context_text: Optional[str] = None,
bank_prompt_scope: Optional[str] = None,
questions_from_current_chunk_attempts: Optional[List[Dict]] = None
) -> str:
"""
Constructs the complete prompt to be sent to the Gemini model.
Args:
working_chunk_text (str): The specific text segment to generate questions from.
num_questions_to_generate (int): The target number of questions to generate.
prompt_type (str): The key for the predefined prompt type (e.g., "PRL").
custom_prompt (Optional[str]): A user-provided custom prompt.
existing_questions_df (Optional[pd.DataFrame]): DataFrame of existing questions for guidance.
current_topic (str): The topic of the current PDF.
full_document_context_text (Optional[str]): The full text of the document for context.
bank_prompt_scope (Optional[str]): Scope for selecting guidance questions ('mismo_tema', 'todos_los_temas').
questions_from_current_chunk_attempts (Optional[List[Dict]]): Questions already generated for this chunk.
Returns:
str: The fully constructed prompt string.
"""
user_prefix = ""
if custom_prompt:
user_prefix = custom_prompt
elif prompt_type in self.prompt_types:
user_prefix = self.prompt_types[prompt_type]
else:
user_prefix = """
Genera preguntas tipo test de opción múltiple basadas en el siguiente texto, siguiendo estas instrucciones:
"""
fixed_prefix = f"""
Genera {num_questions_to_generate} preguntas tipo test de opción múltiple con una única
respuesta correcta basadas en el texto que adjunto al final, siguiendo estas instrucciones:
"""
# --- Guidance from the existing question bank ---
prompt_guidance_from_bank_str = ""
if existing_questions_df is not None and \
not existing_questions_df.empty and \
bank_prompt_scope and bank_prompt_scope != "no_incluir_en_prompt":
questions_for_guidance_df = pd.DataFrame()
header_for_bank_guidance = ""
if bank_prompt_scope == "mismo_tema":
if 'Tema' in existing_questions_df.columns:
questions_for_guidance_df = existing_questions_df[
existing_questions_df['Tema'].astype(str).str.lower() == current_topic.lower()
]
if not questions_for_guidance_df.empty:
header_for_bank_guidance = "\n\nPara evitar redundancia con el BANCO DE PREGUNTAS EXISTENTE, considera las siguientes preguntas del MISMO TEMA (asegúrate de que las nuevas preguntas sean originales y basadas estrictamente en el FRAGMENTO DE TRABAJO proporcionado):\n"
elif bank_prompt_scope == "todos_los_temas":
questions_for_guidance_df = existing_questions_df
if not questions_for_guidance_df.empty:
header_for_bank_guidance = "\n\nPara evitar redundancia con el BANCO DE PREGUNTAS EXISTENTE, considera las siguientes preguntas de TODO EL BANCO (asegúrate de que las nuevas preguntas sean originales y basadas estrictamente en el TEXTO COMPLETO proporcionado):\n"
if not questions_for_guidance_df.empty:
prompt_guidance_from_bank_str += header_for_bank_guidance
for _, row in questions_for_guidance_df.iterrows():
question_text = row.get('Pregunta', '')
prompt_guidance_from_bank_str += f"- Pregunta del banco: {question_text}\n"
if self.current_prompt_example_content_type == "enunciados_y_respuestas":
options = [row.get(f'Respuesta {chr(65 + i)}', '') for i in range(4) if pd.notna(row.get(f'Respuesta {chr(65 + i)}', ''))]
options_str = "\n".join([f" - {opt}" for opt in options if opt])
if options_str:
prompt_guidance_from_bank_str += f"{options_str}\n"
prompt_guidance_from_bank_str += "\n"
# --- Guidance from questions already generated for this chunk in previous attempts ---
prompt_guidance_from_current_attempts_str = ""
if questions_from_current_chunk_attempts: # This is a list of dictionaries
prompt_guidance_from_current_attempts_str += "\n\nAdicionalmente, para evitar generar preguntas idénticas o muy similares a las YA GENERADAS PARA ESTE MISMO FRAGMENTO en intentos previos, ten en cuenta las siguientes preguntas que ya se han aceptado:\n"
for question_dict in questions_from_current_chunk_attempts:
question_text = question_dict.get('Pregunta', '')
if question_text:
prompt_guidance_from_current_attempts_str += f"- Pregunta ya generada para este fragmento: {question_text}\n"
# Optionally, you could also include the options if you find it useful
# if self.current_prompt_example_content_type == "enunciados_y_respuestas":
# options = [question_dict.get(f'Respuesta {chr(65 + i)}', '') for i in range(4)]
# options_str = "\n".join([f" - {opt}" for opt in options if opt])
# if options_str:
# prompt_guidance_from_current_attempts_str += f"{options_str}\n"
prompt_guidance_from_current_attempts_str += "\n"
context_section = ""
chunk_instruction_and_text_section = ""
if full_document_context_text is not None:
context_section = f"""
Aquí está el **CONTEXTO COMPLETO** del documento, que te proporcionamos para que tengas una comprensión general del tema y el documento en su totalidad. Puedes referenciar este texto para obtener contexto adicional, pero las preguntas deben basarse estrictamente en el **FRAGMENTO DE TRABAJO**.
{full_document_context_text}
"""
chunk_instruction_and_text_section = f"""
---
**IMPORTANTE**: Debes generar las preguntas **ÚNICAMENTE** basándote en el siguiente "**FRAGMENTO DE TRABAJO**". La respuesta a cada pregunta debe ser directamente derivable del contenido de este FRAGMENTO DE TRABAJO.
---
Aquí está el FRAGMENTO DE TRABAJO sobre el que debes generar las preguntas:
{working_chunk_text}
"""
else:
chunk_instruction_and_text_section = f"""
Aquí está el **TEXTO COMPLETO** sobre el que debes generar las preguntas:
{working_chunk_text}
"""
prompt = f"""
{user_prefix}
{fixed_prefix}
1. **Incluye la respuesta correcta:** Para cada pregunta, indica cuál de las opciones es la respuesta correcta (solo la letra, sin paréntesis).
2. **Proporciona el texto relevante:** Incluye un fragmento del texto del PDF donde se encuentra la justificación de la respuesta correcta.
3. **Formato de diccionario:** Presenta las preguntas y respuestas en formato de diccionario de Python, donde cada diccionario representa una pregunta y tiene las siguientes claves:
* "Pregunta"
* "Opciones" (una lista de las opciones A, B, C, D, sin la letra inicial y el paréntesis)
* "Respuesta correcta" (la letra de la opción correcta, sin paréntesis)
* "Texto relevante" (el texto relevante del PDF)
Separa cada pregunta con una línea en blanco.
Puedes incluir preguntas del tipo: señala la afirmación incorrecta entre las siguientes opciones; o señala la opción correcta entre las siguientes.
Además, puedes incluir respuestas como todas son correctas, o todas son incorrectas.
{prompt_guidance_from_bank_str}
{prompt_guidance_from_current_attempts_str}
{context_section}
{chunk_instruction_and_text_section}
"""
return prompt
def _analyze_gemini_response(self, response_text: str, topic_number: str) -> List[Dict]:
"""
Analyzes the model's response, filters it by similarity, and returns a list of dictionaries with accepted questions.
Rejected questions are added to the instance's `self.rejected_questions_list`.
Args:
response_text (str): The raw text response from the Gemini model.
topic_number (str): The topic associated with these questions.
Returns:
List[Dict]: A list of accepted question dictionaries.
"""
accepted_questions = []
# The model is asked to separate question dictionaries with a blank line.
question_strs = response_text.split('\n\n')
for question_str in question_strs:
cleaned_str = question_str.strip()
# Attempt to find a Python dictionary-like structure within the string.
match = re.search(r'\{(.*?)\}', cleaned_str, re.DOTALL)
if not match:
continue
json_like_str = "{" + match.group(1) + "}"
try:
# Use eval() to parse the string as a Python dictionary.
question_data = eval(json_like_str)
if not isinstance(question_data, dict) or "Pregunta" not in question_data:
continue
# Check for similarity against the existing bank.
is_similar, sim_score, matched_text = self._is_similar_question(question_data["Pregunta"])
# Build the base dictionary for the generated question.
question_dict = {
"Tema": f"{topic_number}",
"Estado": "Pendiente de revisar",
"Pregunta": question_data.get("Pregunta", ""),
"Respuesta A": question_data.get("Opciones", [""] * 4)[0],
"Respuesta B": question_data.get("Opciones", [""] * 4)[1],
"Respuesta C": question_data.get("Opciones", [""] * 4)[2],
"Respuesta D": question_data.get("Opciones", [""] * 4)[3],
"Respuesta correcta": question_data.get("Respuesta correcta", "").strip().upper(),
"Texto relevante": question_data.get("Texto relevante", "")
}
if is_similar:
# If similar, add it to the rejected list with extra details for reporting.
question_dict['Similitud'] = sim_score
question_dict['Pregunta Coincidente'] = matched_text
self.rejected_questions_list.append(question_dict)
else:
# If not similar, add it to the accepted list for this batch.
accepted_questions.append(question_dict)
except (SyntaxError, TypeError, NameError):
print(f"Error al evaluar la estructura como diccionario: {json_like_str}")
return accepted_questions
[documentos]
def save_questions_to_excel(
self,
df: pd.DataFrame,
filepath: str
) -> None:
"""Saves the questions to an Excel file."""
if not df.empty:
if 'Número de pregunta' not in df.columns:
df.insert(0, 'Número de pregunta', range(1, len(df) + 1))
df.to_excel(filepath, index=False)
print(f"Preguntas aceptadas guardadas en: '{filepath}'")
else:
print("No se generaron preguntas aceptadas para guardar en Excel.")
[documentos]
def save_questions_to_docx(
self,
df: pd.DataFrame,
filepath: str
) -> None:
"""Saves the questions to a DOCX file for manual review."""
if not df.empty:
document = Document()
document.add_heading('Banco de Preguntas - Pendiente de Revisar', level=1)
for index, row in df.iterrows():
pregunta_paragraph = document.add_paragraph()
pregunta_run = pregunta_paragraph.add_run(f"Pregunta {index + 1}: {row['Pregunta']}")
pregunta_run.bold = True
document.add_paragraph(f"Tema: {row['Tema']}")
document.add_paragraph(f"Estado: {row['Estado']}")
document.add_paragraph(f"A) {row['Respuesta A']}")
document.add_paragraph(f"B) {row['Respuesta B']}")
document.add_paragraph(f"C) {row['Respuesta C']}")
document.add_paragraph(f"D) {row['Respuesta D']}")
document.add_paragraph(f"Respuesta correcta: {row['Respuesta correcta']}")
document.add_paragraph(f"Texto relevante: {row['Texto relevante']}")
document.add_paragraph()
document.save(filepath)
else:
print("No hay preguntas para guardar en DOCX.")
[documentos]
def save_rejected_questions_to_docx(self, df: pd.DataFrame, filepath: str) -> None:
"""
Saves questions discarded due to similarity to a DOCX file for inspection.
Args:
df (pd.DataFrame): The DataFrame containing the discarded questions and similarity details.
filepath (str): The filepath for the output file.
"""
if df.empty:
print("No se descartaron preguntas por similitud.")
return
document = Document()
document.add_heading('Preguntas Descartadas por Similitud', level=1)
document.add_paragraph(
"Las siguientes preguntas fueron generadas por el modelo, pero se descartaron "
"automáticamente porque su similitud con una pregunta existente en el banco "
"superó el umbral establecido."
)
for index, row in df.iterrows():
# Visual separator between questions
document.add_paragraph("---")
# Information about the discarded question
p = document.add_paragraph()
p.add_run('Pregunta descartada: ').bold = True
p.add_run(f"{row['Pregunta']}")
document.add_paragraph(f"Tema: {row['Tema']}")
document.add_paragraph(f"A) {row['Respuesta A']}")
document.add_paragraph(f"B) {row['Respuesta B']}")
document.add_paragraph(f"C) {row['Respuesta C']}")
document.add_paragraph(f"D) {row['Respuesta D']}")
document.add_paragraph(f"Respuesta correcta: {row['Respuesta correcta']}")
# Information about the reason for rejection
p_reason = document.add_paragraph()
p_reason.add_run('Motivo del descarte:').bold = True
# Format the similarity score as a percentage
similarity_score_pct = f"{row.get('Similitud', 0.0) * 100:.1f}%"
p_details = document.add_paragraph()
p_details.add_run(f"Similitud: ").bold = True
p_details.add_run(f"{similarity_score_pct}")
p_match = document.add_paragraph()
p_match.add_run("Coincide con la pregunta existente: ").bold = True
p_match.add_run(f"{row.get('Pregunta Coincidente', 'N/A')}")
document.save(filepath)
print(f"Se guardó un informe de {len(df)} preguntas descartadas en: '{filepath}'")
def _normalize_text(self, text: str) -> str:
"""Normalizes text for comparison (lowercase, no punctuation)."""
if not isinstance(text, str):
return ""
text = text.lower()
text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
return text
def _calculate_jaccard_similarity(self, text1: str, text2: str) -> float:
"""Calculates the Jaccard similarity between two texts."""
words1 = set(self._normalize_text(text1).split())
words2 = set(self._normalize_text(text2).split())
if not words1 and not words2:
return 1.0
if not words1 or not words2:
return 0.0
intersection = len(words1.intersection(words2))
union = len(words1.union(words2))
return intersection / union
def _is_similar_question(self,
new_question_text: str
) -> tuple[bool, float, Optional[str]]:
"""
Checks if the new question is similar to any existing one in the bank,
comparing ONLY the question statements for filtering purposes.
Args:
new_question_text (str): The text of the new question.
Returns:
tuple[bool, float, Optional[str]]: (is_similar, similarity_score, matching_question_text)
"""
if self.existing_bank_df is None or self.existing_bank_df.empty or self.similarity_threshold > 1.0:
# If the threshold is > 1.0 (e.g., 1.1), the filter is effectively disabled
return (False, 0.0, None)
normalized_new_question_text = self._normalize_text(new_question_text)
for _, row in self.existing_bank_df.iterrows():
existing_question_text_from_bank = row.get('Pregunta')
if existing_question_text_from_bank and isinstance(existing_question_text_from_bank, str):
normalized_existing_question_text = self._normalize_text(existing_question_text_from_bank)
question_similarity = self._calculate_jaccard_similarity(
normalized_new_question_text,
normalized_existing_question_text
)
if question_similarity >= self.similarity_threshold:
# For filtering, the similarity of the statement is sufficient
return (True, question_similarity, existing_question_text_from_bank)
return (False, 0.0, None)
[documentos]
def generate_multiple_choice_questions(
self,
pdf_paths: List[str],
prompt_type: str,
num_questions_per_chunk_target: int = 5,
output_filename: Optional[str] = None,
output_dir: Optional[str] = None,
custom_prompt: Optional[str] = None,
generate_docx: bool = True,
existing_bank_path: Optional[str] = None,
process_by_pages: bool = False,
pages_per_chunk: int = 1,
similarity_threshold: Optional[float] = 0.8,
bank_prompt_scope: Optional[str] = None,
prompt_example_content_type: str = "solo_enunciados",
print_raw_gemini_answer: bool = False,
max_generation_attempts_per_chunk: int = 3
) -> pd.DataFrame:
"""
Main function to generate multiple-choice questions from multiple PDFs,
with retries to reach the desired number of questions per chunk.
This method orchestrates the entire generation process, including:
- Loading an existing question bank for similarity checks.
- Iterating through each provided PDF.
- Splitting PDFs into chunks if required.
- Looping with multiple attempts per chunk to reach the target number of questions.
- Building prompts, generating content, and analyzing responses.
- Saving accepted and rejected questions to files.
Args:
pdf_paths (List[str]): List of paths to the PDF files.
prompt_type (str): The key for the predefined prompt type.
num_questions_per_chunk_target (int): The target number of questions per text chunk.
output_filename (Optional[str]): The base name for the output files.
output_dir (Optional[str]): The directory to save the output files.
If None, uses the current working directory.
custom_prompt (Optional[str]): A user-provided custom prompt to override the default.
generate_docx (bool): Whether to generate a DOCX file for review.
existing_bank_path (Optional[str]): Path to an existing Excel question bank for similarity filtering.
process_by_pages (bool): If True, process the PDF in chunks of pages.
pages_per_chunk (int): The number of pages per chunk if processing by pages.
similarity_threshold (Optional[float]): The threshold for the Jaccard similarity filter.
bank_prompt_scope (Optional[str]): Scope for selecting guidance questions from the bank.
prompt_example_content_type (str): Content type for guidance questions ('solo_enunciados' or 'enunciados_y_respuestas').
print_raw_gemini_answer (bool): If True, prints the raw API response for debugging.
max_generation_attempts_per_chunk (int): The maximum number of attempts to reach the target per chunk.
Returns:
pd.DataFrame: A DataFrame containing all the successfully generated and filtered questions.
"""
if output_filename is None:
output_filename = 'preguntas_pendiente_de_revisar'
if output_dir is None:
output_dir = os.getcwd()
os.makedirs(output_dir, exist_ok=True)
self.final_df = pd.DataFrame()
self.rejected_questions_list = [] # Accumulates all rejected questions
if similarity_threshold is not None and 0 <= similarity_threshold <= 1.0:
self.similarity_threshold = similarity_threshold
print(f"Filtro de similitud activado con umbral: {self.similarity_threshold}")
else:
self.similarity_threshold = 1.1 # Effectively disables the filter
print(f"Filtro de similitud DESACTIVADO (umbral: {self.similarity_threshold}).")
self.current_prompt_example_content_type = prompt_example_content_type
print(f"Contenido de ejemplos en prompt: {self.current_prompt_example_content_type}")
if existing_bank_path and os.path.exists(existing_bank_path):
try:
self.existing_bank_df = pd.read_excel(existing_bank_path)
print(f"Banco de preguntas existente cargado desde '{existing_bank_path}' para el filtro de similitud.")
except Exception as e:
print(f"Error al cargar el banco de preguntas existente: {e}")
self.existing_bank_df = None
else:
self.existing_bank_df = None
if existing_bank_path: # Only show warning if a path was provided
print(f"Advertencia: El archivo del banco de preguntas existente no se encontró en '{existing_bank_path}'.")
for pdf_path in pdf_paths:
print(f"\n--- Procesando PDF: {pdf_path} ---")
pdf_full_text_content = self.extract_pdf_text(pdf_path)
current_pdf_topic = self.extract_topic_number_from_path(pdf_path)
if pdf_full_text_content:
all_questions_for_this_pdf = []
text_chunks_to_process = []
context_for_chunks = None
if process_by_pages and hasattr(self, 'page_texts') and self.page_texts:
text_chunks_to_process = self._group_pages_into_chunks(pages_per_chunk)
context_for_chunks = pdf_full_text_content
print(f"PDF dividido en {len(text_chunks_to_process)} fragmentos de ~{pages_per_chunk} página(s).")
else:
text_chunks_to_process = [pdf_full_text_content]
print("Procesando el PDF completo como un solo fragmento.")
for i, chunk_text in enumerate(text_chunks_to_process):
print(f"\nProcesando fragmento {i + 1}/{len(text_chunks_to_process)} del PDF...")
questions_accepted_for_this_chunk = [] # This list will hold question dicts
attempts_for_this_chunk = 0
while len(questions_accepted_for_this_chunk) < num_questions_per_chunk_target and \
attempts_for_this_chunk < max_generation_attempts_per_chunk:
attempts_for_this_chunk += 1
num_still_needed = num_questions_per_chunk_target - len(questions_accepted_for_this_chunk)
print(f" Intento {attempts_for_this_chunk}/{max_generation_attempts_per_chunk} para el fragmento: Se necesitan {num_still_needed} preguntas más.")
prompt = self._build_prompt(
working_chunk_text=chunk_text,
num_questions_to_generate=num_still_needed,
prompt_type=prompt_type,
custom_prompt=custom_prompt,
existing_questions_df=self.existing_bank_df,
current_topic=current_pdf_topic,
full_document_context_text=context_for_chunks if process_by_pages else None,
bank_prompt_scope=bank_prompt_scope,
questions_from_current_chunk_attempts=questions_accepted_for_this_chunk
)
try:
response = self.model.generate_content(prompt)
if print_raw_gemini_answer:
print(f" Respuesta cruda de Gemini (Fragmento {i + 1}, Intento {attempts_for_this_chunk}):\n >>>\n{response.text}\n <<<\n ---")
time.sleep(1)
newly_accepted_this_attempt = self._analyze_gemini_response(response.text, current_pdf_topic)
# Filter out duplicates that might have been generated within the same chunk attempt
unique_newly_accepted = []
for q_new in newly_accepted_this_attempt:
is_dup_within_chunk = False
for q_existing_chunk_dict in questions_accepted_for_this_chunk:
# Compare only the statement for duplicates within the chunk
if self._normalize_text(q_new.get("Pregunta", "")) == self._normalize_text(q_existing_chunk_dict.get("Pregunta", "")):
is_dup_within_chunk = True
break
if not is_dup_within_chunk:
unique_newly_accepted.append(q_new)
questions_accepted_for_this_chunk.extend(unique_newly_accepted)
print(
f" Intento {attempts_for_this_chunk}: {len(unique_newly_accepted)} preguntas nuevas aceptadas para este fragmento. Total para fragmento: {len(questions_accepted_for_this_chunk)}/{num_questions_per_chunk_target}")
except Exception as e_gen:
error_str = str(e_gen)
# Comprobamos si el error es por límite de cuota
if "429" in error_str and "quota" in error_str:
# Construimos el mensaje de error mejorado
suggestion = (
"\n\nSugerencias para solucionarlo:\n"
"1. Espera unos minutos: La cuota gratuita se reinicia cada cierto tiempo.\n"
"2. Cambia de modelo: Los modelos 'Pro' son más potentes pero consumen la cuota más rápido. "
"Prueba a cambiar a un modelo 'Flash' (como 'gemini-1.5-flash'), que es más rápido y económico."
)
message = (
"Has excedido el límite de solicitudes a la API de Gemini (Error 429).\n"
f"{suggestion}"
)
# Lanzamos nuestra excepción personalizada con el mensaje mejorado
raise QuotaExceededError(message) from e_gen
else:
# Si es otro error, lo imprimimos para depuración pero no detenemos el proceso
print(f" Error durante la generación/análisis en el intento {attempts_for_this_chunk} para el fragmento {i + 1}: {e_gen}")
import traceback
print(traceback.format_exc())
if len(questions_accepted_for_this_chunk) < num_questions_per_chunk_target:
print(
f" Advertencia: No se alcanzó el objetivo de {num_questions_per_chunk_target} preguntas para el fragmento {i + 1} después de {max_generation_attempts_per_chunk} intentos. Se obtuvieron {len(questions_accepted_for_this_chunk)}.")
all_questions_for_this_pdf.extend(questions_accepted_for_this_chunk)
if all_questions_for_this_pdf:
pdf_df = pd.DataFrame(all_questions_for_this_pdf)
self.final_df = pd.concat([self.final_df, pdf_df], ignore_index=True)
else:
print(f"No se pudo extraer texto del PDF: {pdf_path}")
excel_output_path = os.path.join(output_dir, f'{output_filename}_pendiente_de_revisar.xlsx')
docx_output_path = os.path.join(output_dir, f'{output_filename}_pendiente_de_revisar.docx')
self.save_questions_to_excel(self.final_df, excel_output_path)
if generate_docx:
self.save_questions_to_docx(self.final_df, docx_output_path)
if self.rejected_questions_list:
rejected_df = pd.DataFrame(self.rejected_questions_list)
rejected_docx_path = os.path.join(output_dir, f'{output_filename}_similares_descartadas.docx')
self.save_rejected_questions_to_docx(rejected_df, rejected_docx_path)
else:
print("No se descartaron preguntas por similitud durante todo el proceso.")
print(f"\n--- Proceso de generación completado. ---")
print(f"Total preguntas aceptadas y guardadas: {len(self.final_df)}")
if self.rejected_questions_list:
print(f"Total preguntas rechazadas por similitud (acumulado de todos los intentos): {len(self.rejected_questions_list)}")
return self.final_df
# Example of use (uncomment to test)
# if __name__ == '__main__':
# api_key = "YOUR_GEMINI_API_KEY_HERE" # Replace with your Gemini API key
# generator = QuestionGenerator(api_key)
# pdf_files = ['TEMA 05_Inspecciones de seguridad.pdf', 'TEMA 08_Equipos de proteccion individual.pdf']
#
# # Example 1: Normal processing (the whole text at once)
# # questions_df = generator.generate_multiple_choice_questions(
# # pdf_files,
# # prompt_type="PRL",
# # num_questions_per_chunk_target=10, # Generate 10 questions per PDF
# # output_filename="banco_de_preguntas",
# # generate_docx=True
# # )
#
# # Example 2: Processing by individual pages
# # questions_df_pages = generator.generate_multiple_choice_questions(
# # pdf_files,
# # prompt_type="PRL",
# # num_questions_per_chunk_target=2, # Generate 2 questions per page
# # output_filename="banco_de_preguntas_por_paginas",
# # generate_docx=True,
# # process_by_pages=True, # Activate processing by pages
# # pages_per_chunk=1, # Process each page individually
# # existing_bank_path="examen_PIR.xlsx"
# # )
#
# # Example 3: Processing by groups of pages
# questions_df_page_groups = generator.generate_multiple_choice_questions(
# pdf_files,
# prompt_type="PRL",
# num_questions_per_chunk_target=3, # Generate 3 questions for each group of 1 page(s)
# pages_per_chunk=1, # Group every 1 page to process them together
# output_filename="banco_de_preguntas_grupos_paginas",
# generate_docx=True,
# process_by_pages=True, # Activate processing by pages
# existing_bank_path=r'D:\path\to\your\existing_bank.xlsx',
# bank_prompt_scope='todos_los_temas',
# prompt_example_content_type='solo_enunciados'
# )
#
# # You can use the DataFrame to inspect the generated questions