Código fuente para pyexamgenerator.question_bank_manager

# pyexamgenerator: A tool for generating exams from PDF files using AI.
# Copyright (C) 2024 Daniel Sánchez-García

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import pandas as pd
from docx import Document
import os
from typing import Optional, List, Tuple


[documentos]
class QuestionBankManager:
    """
    Manages a question bank, allowing reading from DOCX files,
    saving to Excel, and adding new questions while avoiding duplicates.
    """
    #TODO: review how the transformation from revised docx to xlsx, etc. is used in the code.

[documentos]
    def read_questions_from_docx(self, docx_path: str) -> Optional[pd.DataFrame]:
        """
        Reads revised questions from a DOCX file and returns a pandas DataFrame.
        The DOCX file must follow a specific format where each piece of information
        (question, topic, state, options, etc.) is prefixed with a specific keyword.

        Args:
            docx_path (str): The path to the DOCX file containing the questions.

        Returns:
            Optional[pd.DataFrame]: A pandas DataFrame with the extracted questions.
                                    Returns None if an error occurs while reading the file.
        """
        try:
            document = Document(docx_path)
            questions = []
            current_question = {}
            state = None  # To track what information we are reading

            for paragraph in document.paragraphs:
                text = paragraph.text.strip()
                if text.startswith("Pregunta "):
                    if current_question:
                        questions.append(current_question)
                    current_question = {"Número de pregunta": text.split(":")[0].replace("Pregunta ", "").strip(), "Pregunta": text.split(":")[1].strip()}
                    state = "pregunta"
                elif text.startswith("Tema:"):
                    current_question["Tema"] = text.replace("Tema:", "").strip()
                    state = "tema"
                elif text.startswith("Estado:"):
                    current_question["Estado"] = text.replace("Estado:", "").strip()
                    state = "estado"
                elif text.startswith("A)"):
                    current_question["Respuesta A"] = text.replace("A)", "").strip()
                    state = "respuesta_a"
                elif text.startswith("B)"):
                    current_question["Respuesta B"] = text.replace("B)", "").strip()
                    state = "respuesta_b"
                elif text.startswith("C)"):
                    current_question["Respuesta C"] = text.replace("C)", "").strip()
                    state = "respuesta_c"
                elif text.startswith("D)"):
                    current_question["Respuesta D"] = text.replace("D)", "").strip()
                    state = "respuesta_d"
                elif text.startswith("Respuesta correcta:"):
                    current_question["Respuesta correcta"] = text.replace("Respuesta correcta:", "").strip().upper()
                    state = "respuesta_correcta"
                elif text.startswith("Texto relevante:"):
                    current_question["Texto relevante"] = text.replace("Texto relevante:", "").strip()
                    state = "texto_relevante"

            if current_question:
                questions.append(current_question)

            df = pd.DataFrame(questions)
            # Ensure column order for consistency
            expected_columns = ['Número de pregunta', 'Tema', 'Estado', 'Pregunta', 'Respuesta A', 'Respuesta B', 'Respuesta C', 'Respuesta D', 'Respuesta correcta', 'Texto relevante']
            df = df.reindex(columns=expected_columns)
            return df

        except Exception as e:
            print(f"Error al leer el archivo DOCX: {e}")
            return None



[documentos]
    def save_questions_to_excel(self, df: pd.DataFrame, output_filename: str = 'preguntas_revisadas') -> None:
        """
        Saves the question DataFrame to an Excel file.

        Args:
            df (pd.DataFrame): The pandas DataFrame containing the questions to save.
            output_filename (str): The name of the output Excel file (without the extension).
                                   Defaults to 'preguntas_revisadas'.
        """
        if not df.empty:
            print("Guardando preguntas revisadas en:", f'{output_filename}.xlsx')
            df.to_excel(f'{output_filename}.xlsx', index=False)
            print(f"Preguntas revisadas guardadas en '{output_filename}.xlsx'")
        else:
            print("No hay preguntas revisadas para guardar.")



[documentos]
    def add_questions_without_duplicates(
            self,
            existing_bank_path: str,
            reviewed_questions_path: str,
            duplicate_check_columns: Optional[List[str]] = ['Pregunta', 'Respuesta A', 'Respuesta B', 'Respuesta C', 'Respuesta D'],
            add_only_acceptable: bool = False
    ) -> Tuple[int, Optional[pd.DataFrame]]:
        """
        Adds questions from a reviewed Excel file to an existing bank, avoiding duplicates.
        It can filter questions by their 'Estado' (State) column.

        Args:
            existing_bank_path (str): Path to the existing question bank Excel file.
            reviewed_questions_path (str): Path to the Excel file with revised questions to add.
            duplicate_check_columns (Optional[List[str]]): List of column names used to
                                                            identify duplicate questions.
                                                            Defaults to checking question and all answers.
            add_only_acceptable (bool, optional): If True, only adds questions with 'Aceptable' status.
                                                  Defaults to False (adds all).

        Returns:
            Tuple[int, Optional[pd.DataFrame]]: A tuple containing the number of questions added and the updated
                                                question bank DataFrame. Returns (-1, None) if there are
                                                errors reading the files.
        """
        try:
            df_existing = pd.read_excel(existing_bank_path)
            df_reviewed = pd.read_excel(reviewed_questions_path)
        except FileNotFoundError:
            print(f"Error: Uno o ambos archivos no se encontraron.")
            return -1, None
        except Exception as e:
            print(f"Error al leer los archivos Excel: {e}")
            return -1, None

        # Block to filter the reviewed questions DataFrame if the option is enabled.
        if add_only_acceptable:
            print("Filtrando para añadir solo preguntas con estado 'Aceptable'.")
            if 'Estado' in df_reviewed.columns:
                original_count = len(df_reviewed)
                df_reviewed = df_reviewed[df_reviewed['Estado'] == 'Aceptable'].copy()
                print(f"Se encontraron {len(df_reviewed)} preguntas 'Aceptable' de un total de {original_count}.")
                if df_reviewed.empty:
                    print("No se encontraron preguntas con estado 'Aceptable' para añadir.")
            else:
                print("Advertencia: La columna 'Estado' no se encontró en el archivo a añadir. No se pudo aplicar el filtro.")

        added_count = 0
        existing_count = len(df_existing)

        # Iterate through each question in the reviewed file
        for index, row_reviewed in df_reviewed.iterrows():
            is_duplicate = False
            # Compare it against every question in the existing bank
            for index_existing, row_existing in df_existing.iterrows():
                match = True
                # Check all specified columns for a perfect match
                for col in duplicate_check_columns:
                    # Logic to handle different scenarios of matching, including NaN values
                    if col in row_reviewed and col in row_existing and pd.notna(row_reviewed[col]) and pd.notna(row_existing[col]) and row_reviewed[col] != row_existing[col]:
                        match = False
                        break
                    elif (col in row_reviewed and col not in row_existing) or \
                            (col not in row_reviewed and col in row_existing) or \
                            (col in row_reviewed and col in row_existing and pd.isna(row_reviewed[col]) and pd.notna(row_existing[col])) or \
                            (col in row_reviewed and col in row_existing and pd.notna(row_reviewed[col]) and pd.isna(row_existing[col])):
                        match = False
                        break
                if match:
                    is_duplicate = True
                    break

            # If no duplicate was found after checking the entire existing bank, add the question
            if not is_duplicate:
                df_existing = pd.concat([df_existing, pd.DataFrame([row_reviewed])], ignore_index=True)
                added_count += 1

        return added_count, df_existing



[documentos]
    def save_dataframe_to_excel(
            self,
            df: pd.DataFrame,
            filepath: str,
            overwrite: bool = True,
            new_suffix: str = '_actualizado'
    ) -> Optional[str]:
        """
        Saves a pandas DataFrame to an Excel file.

        Args:
            df (pd.DataFrame): The DataFrame to save.
            filepath (str): The destination Excel file path.
            overwrite (bool, optional): If True, overwrites the existing file.
                                         If False, saves to a new file with a suffix.
                                         Defaults to True.
            new_suffix (str, optional): The suffix to add to the filename if
                                         overwrite is False. Defaults to '_actualizado'.

        Returns:
            Optional[str]: The path of the file where the data was saved, or None if there was an error.
        """
        try:
            if overwrite:
                df.to_excel(filepath, index=False)
                return filepath
            else:
                base, ext = os.path.splitext(filepath)
                new_filepath = f"{base}{new_suffix}{ext}"
                df.to_excel(new_filepath, index=False)
                return new_filepath
        except Exception as e:
            print(f"Error al guardar el archivo Excel: {e}")
            return None



[documentos]
    def generate_excel_from_docx(
            self,
            docx_path: str,
            excel_output_filename: str,
            output_dir: Optional[str] = None
    ) -> Optional[str]:
        """
        Generates an XLSX file from a revised DOCX file. This is a convenience
        wrapper around `read_questions_from_docx` and `save_questions_to_excel`.

        Args:
            docx_path (str): The path to the input DOCX file.
            excel_output_filename (str): The filename for the output XLSX file.
            output_dir (Optional[str]): The directory to save the output file.
                                    If None, saves it in the same directory as the input docx_path.

        Returns:
            Optional[str]: The path to the saved XLSX file if the operation is successful, otherwise None.
        """

        if output_dir is None:
            output_dir = os.path.dirname(docx_path)

        os.makedirs(output_dir, exist_ok=True)

        full_output_path = os.path.join(output_dir, excel_output_filename)

        df_from_docx: Optional[pd.DataFrame] = self.read_questions_from_docx(docx_path)
        if df_from_docx is not None:
            saved_path = self.save_dataframe_to_excel(df_from_docx, full_output_path, overwrite=True)
            if saved_path:
                print(f"Archivo Excel generado en: {saved_path}")
                return saved_path
        else:
            print("No se pudo leer el archivo DOCX o no se generó el DataFrame.")
        return None



[documentos]
    def add_reviewed_questions_to_existing_excel(
            self,
            existing_excel_path: str,
            reviewed_excel_path: str
    ) -> Optional[str]:
        """
        Adds questions from a reviewed XLSX file to another existing XLSX file,
        avoiding duplication. This is a high-level wrapper function.

        Args:
            existing_excel_path (str): The path to the existing question bank XLSX file.
            reviewed_excel_path (str): The path to the XLSX file containing the revised questions.

        Returns:
            Optional[str]: The path to the updated XLSX file if the operation is successful, otherwise None.
        """
        added_count, updated_df = self.add_questions_without_duplicates(
            existing_excel_path, reviewed_excel_path
        )

        if added_count >= 0:
            print(f"Se añadieron {added_count} preguntas sin duplicados.")
            # Save the updated DataFrame to a new file
            save_path: str = "banco_de_preguntas_actualizado.xlsx"
            self.save_dataframe_to_excel(updated_df, save_path, overwrite=False)
            print(f"Banco de preguntas actualizado guardado en: {save_path}")
            return save_path
        else:
            print("Ocurrió un error al añadir las preguntas.")
        return None



## Example of QuestionBankManager usage

# # Create an instance of the class
# manager = QuestionBankManager()
#
# # Example usage: Generate Excel from DOCX
# docx_file = "preguntas_revisadas.docx"  # Replace with your DOCX file
# excel_file = "preguntas_generadas.xlsx"
# generated_excel_path = manager.generate_excel_from_docx(docx_file, excel_file)
# if generated_excel_path:
#     print(f"Archivo Excel generado: {generated_excel_path}")
#
# # Example usage: Add questions to an existing Excel
# existing_excel = "banco_existente.xlsx"  # Replace with your existing Excel file
# reviewed_excel = "preguntas_generadas.xlsx"  # Using the previously generated file
# updated_excel_path = manager.add_reviewed_questions_to_existing_excel(
#     existing_excel, reviewed_excel
# )
# if updated_excel_path:
#     print(f"Archivo Excel actualizado: {updated_excel_path}")