/**
 * Checks whether a filename follows the Illumina-style FASTQ naming pattern
 * `<name>_S<sample>_L00<lane>_R<read>_001.fastq.gz`.
 *
 * Accepted parts:
 * - `<name>`: one or more letters, digits, `-`, `_` or `.`
 * - `<sample>`: 1 to 8 digits, not starting with 0
 * - `<lane>`: 1 to 4 (i.e. `L001` .. `L004`)
 * - `<read>`: 1 or 2
 *
 * @param filename the bare filename to test (no directory part)
 * @returns true if the whole filename matches the pattern
 */
const isIlluminaFormat = (filename: string): boolean => {
    // The dots of the ".fastq.gz" suffix are escaped so they only match a
    // literal dot; unescaped they would accept any character in that position.
    // The pattern itself requires four underscores, so no separate check on
    // the number of underscore-separated parts is needed.
    const illuminaPattern = /^[a-zA-Z0-9\-_.]+_S[1-9][0-9]{0,7}_L00[1-4]_R[1-2]_001\.fastq\.gz$/;
    return illuminaPattern.test(filename);
};

/**
 * Checks whether a filename follows the SRR-accession FASTQ naming pattern
 * for the given read type.
 *
 * - `'paired'`: `SRR<digits>_<1|2>.fastq.gz`
 * - `'singled'`: `SRR<digits>.fastq.gz`
 *
 * `<digits>` is 4 to 8 digits, not starting with 0.
 *
 * @param filename the bare filename to test (no directory part)
 * @param sequencing_read_type `'paired'` or `'singled'`; any other value yields false
 * @returns true if the whole filename matches the pattern for the read type
 */
const isNCBIGeoFormat = (filename: string, sequencing_read_type: string): boolean => {
    // Dots are escaped so only a literal ".fastq.gz" suffix is accepted. With
    // literal dots the patterns already fix the number of underscores (one for
    // paired, none for singled), so no separate part-count check is needed.
    if (sequencing_read_type === 'paired') {
        return /^SRR[1-9][0-9]{3,7}_[1-2]\.fastq\.gz$/.test(filename);
    }
    if (sequencing_read_type === 'singled') {
        return /^SRR[1-9][0-9]{3,7}\.fastq\.gz$/.test(filename);
    }
    return false;
};

/**
 * Reduces parsed CSV rows to one entry per unique sample.
 *
 * The input may repeat a sample's information over several rows, once for
 * every file associated with that sample. For each distinct `SampleName` the
 * first row is used. From that row only `SampleName` and the columns that
 * take more than one distinct value across all rows are kept; `Filename` is
 * always dropped. The input rows are not modified.
 *
 * @param csvrows parsed CSV rows, each an object keyed by column name; the
 *   column list is taken from the first row
 * @return list of objects with properties SampleName and the column names
 *   that had multiple values; an empty list when there are no rows
 */
export const parseValidSamples = (csvrows: any[]): any[] => {
    // Without rows there is no first row to read the column names from.
    if (!csvrows || csvrows.length === 0) return [];

    // csvrows[0] is the first data row. A row is parsed as an object with
    // keys as column names.
    const columns = Object.keys(csvrows[0]);

    // A "category" is a column other than the standard ones (SampleName,
    // Filename) whose value changes between rows.
    const categoriesWithMultipleValues = columns.filter((column: string) => {
        if (column === 'SampleName' || column === 'Filename') return false;
        const uniqueValuesInColumn = new Set(csvrows.map((row: any) => row[column]));
        return uniqueValuesInColumn.size > 1;
    });

    // Sample rows are duplicated for each file. Take the first row of each
    // sample name, tracking names already seen in a Set so the pass is linear.
    const seenSampleNames = new Set<unknown>();
    const firstOfEachUniqueSample: Record<string, string>[] = [];
    for (const row of csvrows) {
        if (seenSampleNames.has(row.SampleName)) continue;
        seenSampleNames.add(row.SampleName);

        const filteredRow: Record<string, string> = {};
        for (const column of categoriesWithMultipleValues) {
            filteredRow[column] = row[column];
        }
        filteredRow.SampleName = row.SampleName;
        firstOfEachUniqueSample.push(filteredRow);
    }

    return firstOfEachUniqueSample;
};

/**
 * Groups the usable filenames of a CSV by sample name.
 *
 * A row contributes its `Filename` when all of the following hold:
 * - the filename has not already been accepted from an earlier row,
 * - some entry of `fastq_files` has a `name` equal to the filename,
 * - the filename passes `isIlluminaFormat` or `isNCBIGeoFormat`.
 *
 * Samples without any accepted file do not appear in the result.
 *
 * @param csvrows parsed CSV rows with `SampleName` and `Filename` properties
 * @param fastq_files objects whose `name` property is compared to `Filename`
 * @param sequencing_read_type paired or singled, needed for the filename format
 * @return a dictionary where the keys are sample names and the values are lists of filenames
 */
export const parseValidFiles = (csvrows: any[], fastq_files: any[], sequencing_read_type: string): Record<string, string[]> => {
    // Sets give constant-time membership tests instead of rescanning the
    // file list and the accepted rows for every CSV row.
    const availableFileNames = new Set(fastq_files.map((file: any) => file.name));
    const acceptedFilenames = new Set<unknown>();

    const sampleNameToValidFilenames: Record<string, string[]> = {};
    const hasOwn = Object.prototype.hasOwnProperty;

    for (const row of csvrows) {
        const filename = row.Filename;
        // If the filename was already accepted, skip the row.
        if (acceptedFilenames.has(filename)) continue;
        // If a corresponding file is not in the fastq_files list, skip the row.
        if (!availableFileNames.has(filename)) continue;
        // If filename format is invalid, skip the row.
        if (!(isIlluminaFormat(filename) || isNCBIGeoFormat(filename, sequencing_read_type))) continue;
        acceptedFilenames.add(filename);

        const sampleName = row.SampleName;
        // Test for an own property rather than truthiness: a sample named
        // e.g. "constructor" would otherwise resolve to an inherited member
        // and the push below would fail. defineProperty creates a real own
        // property even for a name like "__proto__".
        if (!hasOwn.call(sampleNameToValidFilenames, sampleName)) {
            Object.defineProperty(sampleNameToValidFilenames, sampleName, {
                value: [],
                writable: true,
                enumerable: true,
                configurable: true
            });
        }
        sampleNameToValidFilenames[sampleName].push(filename);
    }

    return sampleNameToValidFilenames;
};

/**
 * Legacy parser. Keeping this one for now in case of any issues with the new
 * parser or in case we need to compare expected outputs.
 *
 * Reads `pageState.csv.contents` (parsed CSV rows keyed by column name) and
 * `pageState.fastq_files` (objects compared by their `name` property), and
 * returns a shallow copy of `pageState` extended with:
 * - `values_by_sample`: the first row of each distinct `SampleName`, with
 *   every column removed that has only one distinct value across all rows.
 *   Here that rule also applies to `SampleName` and `Filename` themselves.
 * - `files_list`: sample name -> filenames that are unique, present in
 *   `pageState.fastq_files` and pass one of the filename format checks.
 *   Samples without such a file are absent from it.
 *
 * NOTE: the rows returned in `values_by_sample` are the same objects as in
 * `pageState.csv.contents`; their columns are deleted in place.
 *
 * Several intermediate values below are computed but never used in the
 * return value; they are left as they were.
 *
 * @param pageState object providing `csv.contents` and `fastq_files`
 * @param sequencing_read_type passed on to `isNCBIGeoFormat`
 */
/* istanbul ignore next */
const _oldParser = (pageState: any, sequencing_read_type: string) => {
    const contents = pageState.csv!.contents;
    // Column names are taken from the first row.
    const columns = Object.keys(contents[0]);
    // Two identical maps of column -> distinct values; the second one is
    // pruned further down.
    const uniqueValuesByCategoryWithDeletions: {
        [key: string]: string[];
    } = {};
    const uniqueValuesByCategory: {
        [key: string]: string[];
    } = {};
    columns.forEach((column: string) => {
        uniqueValuesByCategory[column] = Array.from(new Set(contents.map((row: any) => row[column])));
        uniqueValuesByCategoryWithDeletions[column] = Array.from(new Set(contents.map((row: any) => row[column])));
    });
    // Not used below.
    const sampleNames = uniqueValuesByCategory['SampleName'];
    const filenames = uniqueValuesByCategory['Filename'];

    // array with a list of categories (as strings) with more than one value. Just get the list of category names
    // (computed over all columns, so SampleName and Filename can be included)
    const categoriesWithMoreThanOneValue: any = [];
    for (const category in uniqueValuesByCategory) {
        if (uniqueValuesByCategory[category].length > 1) {
            categoriesWithMoreThanOneValue.push(category);
        }
    }
    // if category in uniqueValuesByCategoryWithDeletions has only one value, delete it from uniqueValuesByCategoryWithDeletions
    for (const category in uniqueValuesByCategoryWithDeletions) {
        if (uniqueValuesByCategoryWithDeletions[category].length === 1) {
            delete uniqueValuesByCategoryWithDeletions[category];
        }
    }

    delete uniqueValuesByCategoryWithDeletions['Filename'];
    delete uniqueValuesByCategoryWithDeletions['SampleName'];

    // Keep the first row for each distinct filename.
    const filenameDedupedSamples: any = [];
    for (const row of contents) {
        if (filenameDedupedSamples.some((obj: any) => obj.Filename === row.Filename)) continue;
        filenameDedupedSamples.push(row);
    }

    // Keep only rows whose file is present in fastq_files and whose filename
    // passes one of the two format checks.
    const dedupedAndCleanedSampleFiles = filenameDedupedSamples.filter((row: any) => {
        return (
            pageState.fastq_files.some((file: any) => file.name === row.Filename) &&
            (isIlluminaFormat(row.Filename) || isNCBIGeoFormat(row.Filename, sequencing_read_type))
        );
    });

    // Group the accepted filenames by sample name. This is what is returned
    // as files_list.
    const dedupedAndCleanedFilesBySample: any = {};
    for (const row of dedupedAndCleanedSampleFiles) {
        if (!dedupedAndCleanedFilesBySample[row.SampleName]) {
            dedupedAndCleanedFilesBySample[row.SampleName] = [];
        }
        dedupedAndCleanedFilesBySample[row.SampleName].push(row.Filename);
    }

    // Variant of the grouping that also lists samples without files. It is
    // built here but not part of the return value.
    const dedupedAndCleanedFilesBySampleWithEmptyArrays: any = {};
    // add empty arrays for samples that don't have any files
    for (const sampleName of uniqueValuesByCategory['SampleName']) {
        if (!dedupedAndCleanedFilesBySample[sampleName]) {
            dedupedAndCleanedFilesBySampleWithEmptyArrays[sampleName] = [];
        } else {
            dedupedAndCleanedFilesBySampleWithEmptyArrays[sampleName] = dedupedAndCleanedFilesBySample[sampleName];
        }
    }

    // First row of each distinct sample name (same object references as in
    // contents).
    const rawFirstOfEachUniqueSample: any[] = [];
    for (const row of contents) {
        if (rawFirstOfEachUniqueSample.some((obj: any) => obj.SampleName === row.SampleName)) continue;
        // delete row["Filename"]
        rawFirstOfEachUniqueSample.push(row);
    }

    // remove unused columns
    // This deletes properties on the row objects themselves, so the rows in
    // pageState.csv.contents are modified. It runs after the file grouping
    // above, which reads Filename and SampleName from those same rows.
    const firstOfEachUniqueSample = rawFirstOfEachUniqueSample.map((row: any) => {
        const columns = Object.keys(row);
        columns.forEach((column: string) => {
            if (!categoriesWithMoreThanOneValue.includes(column)) {
                delete row[column];
            }
        });
        return row;
    });

    // create an array with all values from all categories in the uniqueValuesByCategoryWithDeletions object
    // (not used in the return value)
    const allValuesFromAllCategories: any[] = [];
    for (const category in uniqueValuesByCategoryWithDeletions) {
        allValuesFromAllCategories.push(...uniqueValuesByCategoryWithDeletions[category]);
    }

    // list of files from dedupedAndCleanedFilesBySample
    // (not used in the return value)
    const filesFromDedupedAndCleanedFilesBySample: any[] = [];
    for (const sampleName in dedupedAndCleanedFilesBySample) {
        filesFromDedupedAndCleanedFilesBySample.push(...dedupedAndCleanedFilesBySample[sampleName]);
    }

    return {
        ...pageState,
        values_by_sample: firstOfEachUniqueSample,
        files_list: dedupedAndCleanedFilesBySample
    };
};

/** Describes a column that must be present in an uploaded CSV. */
interface RequiredColumn {
    /** Header text the CSV must contain. */
    name: string;
    type: {
        /** Label for the kind of value held in the column. */
        name: string;
        /** When true, a cell that is undefined, null or '' fails validation. */
        required: boolean;
        /** Substrings that must not occur in the header text. */
        columnIllegalChars: string[];
        /** Substrings that must not occur in any cell of the column. */
        dataIllegalChars: string[];
    };
}

/**
 * Checks a parsed CSV for the structure the rest of the pipeline relies on.
 *
 * The CSV is rejected when:
 * - it has no rows, or its first row has no columns;
 * - a required column (`SampleName`, `Filename`) is missing from the first
 *   row, or any row has an empty value (undefined, null or '') in it;
 * - any other column name contains one of the forbidden header characters.
 *
 * Cell values are additionally screened against a list of forbidden data
 * characters; that list is currently empty, so no cell is rejected by it.
 *
 * Every problem found is reported with `console.error`. Checking of the
 * required columns stops at the first failing column, and checking of the
 * remaining columns stops at the first failing column too; both groups are
 * always checked.
 *
 * @param csv the uploaded file together with its parsed rows; only
 *   `contents` is inspected
 * @returns true if the CSV file is valid, false otherwise
 */
export const validateCSV = (csv: {file: File; contents: Record<string, any>[]}): boolean => {
    const forbiddenInHeader: string[] = [',', ';', ':', ' ', '.', '/', '\\', '|', '?', '<', '>', '[', ']', '{', '}', '(', ')'];
    const forbiddenInData: string[] = [];

    const describeRequired = (name: string): RequiredColumn => ({
        name,
        type: {
            name: 'string',
            required: true,
            columnIllegalChars: forbiddenInHeader,
            dataIllegalChars: []
        }
    });
    const mandatory: RequiredColumn[] = [describeRequired('SampleName'), describeRequired('Filename')];

    if (csv.contents.length === 0) {
        console.error('CSV file is empty.');
        return false;
    }

    // Header names come from the first row only.
    const headers = Object.keys(csv.contents[0]);
    if (headers.length === 0) {
        console.error('CSV file has no columns.');
        return false;
    }

    // Header and cell checks for a single required column.
    const requiredColumnPasses = (spec: RequiredColumn): boolean => {
        const header = spec.name;
        const headerHasIllegalChar = spec.type.columnIllegalChars.some(ch => header.includes(ch));
        if (!headers.includes(header) || headerHasIllegalChar) {
            console.error(`Invalid header: ${header}`);
            return false;
        }

        // Stops at the first offending row.
        const offendingRowExists = csv.contents.some((row: any) => {
            const cell = row[header];
            const isBlank = cell === undefined || cell === null || cell === '';
            if (spec.type.required && isBlank) {
                console.error(`Missing required value in column ${header}`);
                return true;
            }
            return spec.type.dataIllegalChars.some(ch => cell && cell.toString().includes(ch));
        });
        if (offendingRowExists) {
            console.error(`Invalid data in column: ${header}`);
            return false;
        }
        return true;
    };

    let requiredOk = true;
    for (const spec of mandatory) {
        if (!requiredColumnPasses(spec)) {
            requiredOk = false;
            break;
        }
    }

    // The remaining columns are checked even when a required column failed,
    // so their problems are logged as well.
    const mandatoryNames = mandatory.map(spec => spec.name);
    let extrasOk = true;
    for (const header of headers) {
        if (mandatoryNames.includes(header)) continue;

        if (forbiddenInHeader.some(ch => header.includes(ch))) {
            console.error(`Invalid column name: ${header}. Contains illegal characters.`);
            extrasOk = false;
            break;
        }

        // Only non-numeric, non-empty string cells are screened for
        // forbidden data characters.
        const offendingCellExists = csv.contents.some((row: any) => {
            const cell = row[header];
            const notNumeric = isNaN(cell);
            if (!notNumeric || !cell || typeof cell !== 'string') return false;
            return forbiddenInData.some(ch => cell.includes(ch));
        });
        if (offendingCellExists) {
            console.error(`Invalid data in additional column: ${header}`);
            extrasOk = false;
            break;
        }
    }

    if (requiredOk && extrasOk) {
        return true;
    }

    console.error('CSV file failed validation checks.');
    return false;
};
