symnmf.c · SymNMF-Clustering

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>
#include "symnmf.h"

#define BETA 0.5
#define EPSILON 1e-4
#define SMALL_NUMBER 1e-13
#define MAX_ITER 300

/**
 * @brief Transposes a given matrix.
 *
 * @param mat Input matrix (flattened).
 * @param n Number of rows.
 * @param k Number of columns.
 * @return Pointer to the transposed matrix, or NULL if memory allocation fails.
 */
static double *transpose_matrix(const double *mat, int n, int k){
    int i, j;
    double *mat_t = malloc((size_t)n*k*sizeof(double));
    if(!mat_t) return NULL;
    for(i=0; i<n; i++){
        for(j=0; j<k; j++){
            mat_t[(j*n)+i] = mat[(i*k)+j];
        }
    }
    return mat_t;
}

/**
 * @brief Multiplies two matrices.
 *
 * @param matA First matrix.
 * @param matB Second matrix.
 * @param rowsA Number of rows in matA.
 * @param cols_rows Number of columns in matA and rows in matB.
 * @param colsB Number of columns in matB.
 * @return A pointer to product matrix, or NULL if memory allocation fails.
 */
static double *mat_mult(const double *matA, const double *matB, int rowsA, int cols_rows, int colsB){
    int i, j, k;
    double sum;
    double *prod = malloc(rowsA * colsB * sizeof(double));
    if(!prod) return NULL;
    for(i=0; i<rowsA; i++){
        for(j=0; j<colsB; j++){
            sum=0.0;
            for(k=0; k<cols_rows; k++){
                sum += matA[(i*cols_rows) + k]*matB[(k*colsB) + j];
            }
            prod[(i*colsB) + j] = sum;
        }
    }
    return prod;
}

/**
 * @brief Checks for convergence of the H matrix.
 *
 * @param h_new New H matrix.
 * @param h_old Previous H matrix.
 * @param n Number of rows.
 * @param k Number of columns.
 * @return 1 iff converged, 0 otherwise.
 */
static int check_convergence(const double *h_new, const double *h_old, int n, int k){
    int i, j;
    double sum = 0.0;
    for(i=0; i<n; i++){
        for(j=0; j<k; j++){
            sum += pow(h_new[(i*k) + j] - h_old[(i*k) + j], 2);
        }
    }
    return sum < EPSILON;
}

/**
 * @brief Calculates the matrix product H*Hᵀ*H.
 *
 * @param mat Input matrix.
 * @param n Number of rows.
 * @param k Number of columns.
 * @return A pointer to the  product matrix, or NULL if memory allocation fails.
 */
static double *mult_transpose(const double *mat, int n, int k){
    double *transpose, *res, *temp;
    transpose = transpose_matrix(mat, n, k);
    if(!transpose) return NULL;
    temp = mat_mult(transpose, mat, k, n, k);
    if(!temp){
        free(transpose);
        return NULL;
    }
    res = mat_mult(mat, temp, n, k, k);
    free(transpose);
    free(temp);
    return res;
}

/**
 * @brief Creates a new H matrix using the iterative update rule.
 *
 * @param old_h Previous H matrix.
 * @param w_matrix  Norm matrix.
 * @param n Number of rows (& columns in the norm matrix).
 * @param k Number of columns in H.
 * @return A pointer to the new H matrix, or NULL if memory allocation fails.
 */
static double *create_h_new(const double *old_h, const double *w_matrix, int n, int k){
    int i, j;
    double inside;
    double *h_matrix, *mone, *mehane;
    h_matrix = malloc((size_t)n*k*sizeof(double));
    if(!h_matrix) return NULL;
    mone = mat_mult(w_matrix, old_h, n, n, k);
    if(!mone){
        free(h_matrix);
        return NULL;
    }
    mehane = mult_transpose(old_h, n, k);
    if(!mehane){
        free(h_matrix);
        free(mone);
        return NULL;
    }
    for(i=0; i<n; i++){ /*calc the new H matrix based on the numerator and denominator calculated before, and BETA.*/
        for(j=0; j<k; j++){
            inside = (mone[(i*k) + j])/(mehane[(i*k) + j] + SMALL_NUMBER); /*SMALL_NUMBER is used to make sure no division by zero is done.*/
            h_matrix[(i*k) + j] = old_h[(i*k) + j]*((1.0-BETA)+(BETA*inside));
        }
    }
    free(mone);
    free(mehane);
    return h_matrix;
}

/**
 * @brief Iteratively updates the H matrix until convergence or max iterations are reached.
 *
 * @param h_matrix Initial H matrix.
 * @param w_matrix Norm matrix.
 * @param n Number of rows (& columns in the norm matrix).
 * @param k Number of columns in H.
 * @return The converged H matrix, or NULL if memory allocation fails.
 */
double *converge_h(double *h_matrix, const double *w_matrix, int n, int k){
    int i, res;
    double *old_h;
    for(i=0; i<MAX_ITER; i++){
        old_h = h_matrix;
        h_matrix = create_h_new(old_h, w_matrix, n, k);
        if(!h_matrix){
            free(old_h);
            return NULL;
        }
        res = check_convergence(h_matrix, old_h, n, k);
        free(old_h);
        if(res) break;
    }
    return h_matrix;
}

/**
 * @brief Builds the normalized similarity matrix W.
 *
 * @param d_matrix Diagonal Degree matrix.
 * @param a_matrix Similarity matrix.
 * @param n Number of data points.
 * @return A pointer to the norm matrix, or NULL if memory allocation fails.
 */
double *build_w(const double *d_matrix, const double *a_matrix, int n){
    int i,j;
    double d_i, d_j;
    double *w = malloc((size_t)n*n*sizeof(double));
    if(!w) return NULL;
    for(i=0; i<n;i++){
        d_i = d_matrix[i] != 0.0 ? 1.0/sqrt(d_matrix[i]) : 0.0;
        for(j=0;j<n;j++){
            d_j = d_matrix[j] != 0.0 ? 1.0/sqrt(d_matrix[j]) : 0.0;
            w[(i*n)+j] = d_i*a_matrix[i*n+j]*d_j; /*smart matrix multiplication of symmetric and diagonal matrices*/
        }
    }
    return w;
}

/**
 * @brief Computes the similarity value between two data points.
 *
 * @param point1 First point.
 * @param point2 Second point.
 * @param d Dimension of the points.
 * @return The value for the similarity matrix entry.
 */
static double sym_val(const double *point1, const double *point2, int d){   
    int i;
    double e_exponent=0.0, sum=0.0;
    double temp;
    
    for(i=0; i<d; i++){
       temp =  point1[i] - point2[i];
       sum += pow(temp,2);
    }
    e_exponent = (-0.5)*sum;
    return (exp(e_exponent));
}

/**
 * @brief Builds the similarity matrix A.
 *
 * @param p_matrix Flattened 1D array of size n*d of the data points.
 * @param n Number of data points.
 * @param d Dimension of the data points.
 * @return A pointer to the similarity matrix, or NULL if memory allocation fails.
 */
double *build_a(const double *p_matrix, int n, int d){
    int i, j;
    double sym_v;
    double *a = malloc((size_t)n*n*sizeof(double));
    if(!a) return NULL;
    for(i=0; i<n;i++){
        for(j=0;j<i;j++){ /*calc sym value only under the diag*/
            sym_v = sym_val(&p_matrix[i * d], &p_matrix[j * d], d); 
            a[(i*n) + j] = sym_v; 
            a[(j*n) + i] = sym_v; /*a is symmetric*/
        } 
        a[(i*n) + i] = 0.0;  /*diag is all 0's */
    }  
    return a;      
    
}

/**
 * @brief Builds the diagonal degree matrix D (diagonal, size n).
 *
 * @param a_matrix The similarity matrix.
 * @param n Number of data points.
 * @return A pointer to a 1D array of the diagonal elements of D, or NULL if memory allocation fails.
 */
double *build_d(const double *a_matrix, int n){
    int i, j;
    double d_i;
    double *d = malloc((size_t)n*sizeof(double));
    if(!d) return NULL;
    for(i=0; i<n;i++){
        d_i = 0.0;
        for(j=0;j<n;j++){ 
            d_i+=a_matrix[(i*n) + j];
        } 
        d[i] = d_i;  
    }  
    return d;
}

/**All functions responsible for running the file as a standalone.
 * when compiling, need to use -DBUILD_STANDALONE to make sure it's included
*/
#ifdef BUILD_STANDALONE

/**
 * @brief Compute a matrix on input data based on the specified goal.
 *
 * @param p_matrix Pointer to a flattened matrix of size n*d.
 * @param n Number of data points.
 * @param d Dimension of the data points.
 * @param goal A string indicating the matrix to compute: "sym", "ddg", or "norm".
 * @return Pointer to the resulting matrix (flattened), or NULL if memory allocation fails.
 */
static double *calc_mat(double *p_matrix, int n, int d, char *goal){
    double *a_mat, *d_mat, *w_mat;
    a_mat = build_a(p_matrix, n, d);
    if(!a_mat) return NULL;
    if(strcmp(goal, "sym") == 0) return a_mat;
    d_mat = build_d(a_mat, n);
    if(!d_mat){
        free(a_mat);
        return NULL;
    }
    if(strcmp(goal, "ddg") == 0){
        free(a_mat);
        return d_mat;
    }
    w_mat = build_w(d_mat, a_mat, n);
    free(a_mat);
    free(d_mat);
    if(!w_mat) return NULL;
    return w_mat;
}

/**
 * @brief Print matrix to standard output.
 *
 * @param mat Matrix to be printed (as flattened 1D array).
 * @param numRows Number of rows.
 * @param numCols Number of columns.
 */
static void print_matrix(double *mat, int numRows, int numCols){
    int i, j;
    for(i=0; i<numRows;i++){
        for(j=0;j<numCols;j++){ 
            printf("%.4f",mat[(i*numCols) + j]);
            if (j<numCols-1)
            {
                printf(",");
            }
        } 
        printf("\n");   
    }
}

/**
 * @brief Prints the Diagonal Degree matrix given its diagonal elements.
 * @param d_mat A 1D array containing the diagonal elements of D.
 * @param n Number of data points.
 */
static void print_d(double *d_mat, int n){
    int i, j;
    for(i=0; i<n;i++){
        for(j=0;j<n;j++){ 
            if(i==j) printf("%.4f",d_mat[i]);
            else printf("0.0000");
            if (j<n-1) printf(",");
        } 
        printf("\n");   
    }
}

/**
 * @brief Parse input file to determine number of points and their dimensions.
 *
 * @param fp File pointer to the input data file.
 * @param n Pointer to an integer where the number of data points will be stored.
 * @param d Pointer to an integer where the dimension of the data points will be stored.
 */
static void get_n_d(FILE *fp, int *n, int *d){
    int firstline=1;
    double p;
    char c;
    *n=0;
    *d=0;
    while (fscanf(fp, "%lf%c", &p, &c) == 2)
    {
        if(firstline) (*d)++;
        if(c == '\n'){
            (*n)++;
            firstline=0;
        }
        
    }
}

/**
 * @brief Reads data points from a file into a flat matrix.
 *
 * @param fp File pointer to the input data file.
 * @param n Number of data points to read.
 * @param d Dimension of the data points to read.
 * @return A pointer to the flat matrix of data points, or NULL if memory allocation or file reading fails.
 */
static double *read_from_file(FILE *fp, int n, int d){
    double *p_matrix;
    int i, j;
    p_matrix = malloc((size_t)n*d*sizeof(double));
    if(!p_matrix) return NULL; 
    for(i=0; i<n; i++){
        for(j=0;j<d;j++){
            if(fscanf(fp, "%lf", &p_matrix[(i*d) + j]) != 1){
                free(p_matrix);
                return NULL;
            }
            if (j< d-1) fgetc(fp);
        }
        fgetc(fp);
    }
    return p_matrix;
}

int main(int argc, char **argv){
    double *p_mat, *end_mat;
    int n, d;
    FILE *fp;
    (void)argc;
    fp = fopen(argv[2], "r");
    if(!fp) goto error_case;
    get_n_d(fp, &n, &d);
    rewind(fp); /*make sure we start reading from the beginning of the file*/
    p_mat = read_from_file(fp, n, d);
    if(!p_mat) goto error_case;
    end_mat = calc_mat(p_mat, n, d, argv[1]);
    free(p_mat);
    if(end_mat){
        if(strcmp(argv[1], "ddg") == 0) print_d(end_mat, n);
        else print_matrix(end_mat, n, n);
        free(end_mat);
        fclose(fp);
        return 0;
    }
error_case:
    printf("An Error Has Occurred\n");
    if(fp) fclose(fp);
    return 1;
}
#endif