ClusterMain.cpp
上传用户:szb0815
上传日期:2007-06-13
资源大小:338k
文件大小:150k
- /*
- Software and source code Copyright (C) 1998-2000 Stanford University
- Written by Michael Eisen (eisen@genome.stanford.edu)
- This software is copyright under the following conditions:
- Permission to use, copy, and modify this software and its documentation
- is hereby granted to all academic and not-for-profit institutions
- without fee, provided that the above copyright notice and this permission
- notice appear in all copies of the software and related documentation.
- Permission to distribute the software or modified or extended versions
- thereof on a not-for-profit basis is explicitly granted, under the above
- conditions. However, the right to use this software in conjunction with
- for profit activities, and the right to distribute the software or modified or
- extended versions thereof for profit are *NOT* granted except by prior
- arrangement and written consent of the copyright holders.
- Use of this source code constitutes an agreement not to criticize, in any
- way, the code-writing style of the author, including any statements regarding
- the extent of documentation and comments present.
- The software is provided "AS-IS" and without warranty of ank kind, express,
- implied or otherwise, including without limitation, any warranty of
- merchantability or fitness for a particular purpose.
- In no event shall Stanford University or the authors be liable for any special,
- incudental, indirect or consequential damages of any kind, or any damages
- whatsoever resulting from loss of use, data or profits, whether or not
- advised of the possibility of damage, and on any theory of liability,
- arising out of or in connection with the use or performance of this software.
- This code was written using Borland C++ Builder 4 (Inprise Inc., www.inprise.com)
- and may be subject to certain additional restrictions as a result.
- */
- //---------------------------------------------------------------------------
- #include <vcl.h>
- #pragma hdrstop
- #include "ClusterMain.h"
- #include <math.h>
- #include <dir.h>
- #include "FileFormat.h"
- #include "ClusterAbout.h"
- #include "ShellApi.h"
- #include "NumericalRecipes.h"
- #include "MyUtils.h"
- //---------------------------------------------------------------------------
- #pragma package(smart_init)
- #pragma resource "*.dfm"
- /* This module contains the handling code for the main form of the program
- as well as most of the computational routines. This is not the most elegant
- way to have done this, but this software was written on the fly and did not
- evolve in an entirely coherent manner. I have tried to organize things
- logically. -MBE 2/2000 */
- TMainForm *MainForm;
- //---------------------------------------------------------------------------
- //---------------------------------------------------------------------------
- /* A node class TNode is defined here. This was put in mostly for future
- versions of the program and isn't used extensively here */
- __fastcall TNode::TNode() : TObject()
- {
- IsNode = false;
- Child1 = NULL;
- Child2 = NULL;
- }
- __fastcall TNode::~TNode()
- {
- if (IsNode)
- {
- delete Child1;
- delete Child2;
- }
- }
- /* Constructer for main form */
- __fastcall TMainForm::TMainForm(TComponent* Owner)
- : TForm(Owner)
- {
- /* This allows drag-and-drop of files to work by sending all windows messages
- to the handler routine AppMessage */
- Application->OnMessage = AppMessage;
- GeneMetricComboBox->ItemIndex = 0;
- ArrayMetricComboBox->ItemIndex = 0;
- /* Current version stored here; checked against master website to alter
- user for updates */
- AnsiString Version = "2.10";
- try
- {
- NMHTTP1->Get("http://rana.stanford.edu/software/clusterversion.html");
- }
- catch (Exception &E)
- {
- }
- if (Version != (AnsiString(NMHTTP1->Body)).SubString(0,Version.Length()) )
- {
- Caption = "Gene Cluster - Check for update at http://rana.stanford.edu/software";
- }
- else
- {
- Caption = "Gene Cluster";
- }
- /* Load in data from registry */
- TRegistry *Registry = new TRegistry();
- Registry->RootKey = HKEY_CURRENT_USER;
- Registry->OpenKey("Software\Stanford\Cluster\WeightSettings",true);
- try
- {
- GeneWeightCutoff = Registry->ReadFloat("GeneWeightCutoff");
- }
- catch (Exception &E)
- {
- GeneWeightCutoff = 0.9;
- }
- try
- {
- GeneWeightExp = Registry->ReadFloat("GeneWeightExp");
- }
- catch (Exception &E)
- {
- GeneWeightExp = 1.0;
- }
- try
- {
- ArrayWeightCutoff = Registry->ReadFloat("ArrayWeightCutoff");
- }
- catch (Exception &E)
- {
- ArrayWeightCutoff = 0.9;
- }
- try
- {
- ArrayWeightExp = Registry->ReadFloat("ArrayWeightExp");
- }
- catch (Exception &E)
- {
- ArrayWeightExp = 1.0;
- }
- Registry->CloseKey();
- Registry->OpenKey("Software\Stanford\Cluster\Directory",true);
- try
- {
- LoadFileDialog->InitialDir = Registry->ReadString("LastOpenDirectory");
- }
- catch (Exception &E)
- {
- }
- Registry->CloseKey();
- delete Registry;
- GeneWeightCutoffEdit->Text = GeneWeightCutoff;
- GeneWeightExpEdit->Text = GeneWeightExp;
- ArrayWeightCutoffEdit->Text = ArrayWeightCutoff;
- ArrayWeightExpEdit->Text = ArrayWeightExp;
- DragAcceptFiles(Handle,TRUE);
- Rows = 0;
- Columns = 0;
- /* Default Filtering Parameters */
- FilterPercentVal = 80.0;
- FilterPercentEdit->Text = FilterPercentVal;
- FilterSDVal = 2.0;
- FilterSDEdit->Text = FilterSDVal;
- FilterAbsValCount = 1;
- FilterAbsValCountEdit->Text = FilterAbsValCount;
- FilterAbsValVal = 2.0;
- FilterAbsValEdit->Text = FilterAbsValVal;
- FilterMaxMinVal = 2.0;
- FilterMaxMinEdit->Text = FilterMaxMinVal;
- /* Defaul SOM parameters */
- SOMGenesXDim = 1;
- SOMGenesYDim = 10;
- SOMArraysXDim = 1;
- SOMArraysYDim = 10;
- SOMGenesTau = 0.02;
- SOMArraysTau = 0.02;
- SOMGenesIterations = 100000;
- SOMArraysIterations = 20000;
- /* Default K-means parameters */
- GenesK = 10;
- ArraysK = 10;
- GMaxKCycles = 100;
- AMaxKCycles = 100;
- }
- //---------------------------------------------------------------------------
- void __fastcall TMainForm::FormClose(TObject *Sender, TCloseAction &Action)
- {
- TRegistry *Registry = new TRegistry();
- Registry->RootKey = HKEY_CURRENT_USER;
- Registry->OpenKey("Software\Stanford\Cluster\WeightSettings",true);
- Registry->WriteFloat("GeneWeightCutoff",GeneWeightCutoff);
- Registry->WriteFloat("GeneWeightExp",GeneWeightExp);
- Registry->WriteFloat("ArrayWeightCutoff",ArrayWeightCutoff);
- Registry->WriteFloat("ArrayWeightExp",ArrayWeightExp);
- Registry->CloseKey();
- Registry->OpenKey("Software\Stanford\Cluster\Directory",true);
- char Drive[3];
- char Dir[260];
- fnsplit(LoadFileDialog->FileName.c_str(),Drive,Dir,NULL,NULL);
- AnsiString SaveDir = AnsiString(Drive) + AnsiString(Dir);
- Registry->WriteString("LastOpenDirectory",SaveDir);
- Registry->CloseKey();
- delete Registry;
- }
- void __fastcall TMainForm::FormDestroy(TObject *Sender)
- {
- int i;
- for (i=0;i<2*Rows-1;i++)
- {
- delete GeneData[i];
- delete GeneMask[i];
- }
- delete GeneData;
- delete GeneMask;
- delete Headers;
- delete IsData;
- delete InColumn;
- delete ArrayWeight;
- delete ArrayOrder;
- delete UniqID;
- delete GeneName;
- delete GeneWeight;
- delete GeneOrder;
- delete Use;
- }
- /* Enable Drag-Drop */
- void __fastcall TMainForm::FormDragDrop(TObject *Sender, TObject *Source,
- int X, int Y)
- {
- /* int i;
- i = 10;
- i = 10 * 10; */
- }
- //---------------------------------------------------------------------------
- void __fastcall TMainForm::FormDragOver(TObject *Sender, TObject *Source,
- int X, int Y, TDragState State, bool &Accept)
- {
- Accept = true;
- }
- //---------------------------------------------------------------------------
- void __fastcall TMainForm::AppMessage(tagMSG &Msg, bool &Handled)
- {
- if (Msg.message == WM_DROPFILES)
- {
- char lpszFile[256];
- DragQueryFile((HANDLE) Msg.wParam, 0, lpszFile, sizeof(lpszFile));
- LoadFile(AnsiString(lpszFile));
- Handled = true;
- }
- /* DragFinish((HANDLE) wParam); */
- /* for all other messages, Handled remains False so that other message handlers can respond */
- }
- /* Load in data from tab-delimited text file */
- void __fastcall TMainForm::LoadFileButtonClick(TObject *Sender)
- {
- // User will select a data file (*.txt)
- if (LoadFileDialog->Execute())
- {
- char Drive[3];
- char Dir[260];
- fnsplit(LoadFileDialog->FileName.c_str(),Drive,Dir,NULL,NULL);
- LoadFileDialog->InitialDir = AnsiString(Drive) + AnsiString(Dir);
- /* Load in the file */
- LoadFile(LoadFileDialog->FileName);
- /* Update buttons to reflect presence of data */
- FilterResultsLabel->Visible = false;
- AcceptFilterButton->Visible = false;
- /* Update default SOM sizes */
- SOMGenesYDim = 1 + sqrt((double)Rows);
- SOMGenesYDimEdit->Text = SOMGenesYDim;
- SOMArraysYDim = 1 + sqrt((double)Columns);
- SOMArraysYDimEdit->Text = SOMArraysYDim;
- /* Clear filtering information */
- FileNameMemo->Lines->Clear();
- FileNameMemo->Lines->Add(LoadFileDialog->FileName);
- }
- }
- void __fastcall TMainForm::LoadFile(AnsiString FileName)
- {
- // Clear old Data
- int i,j,l;
- for (i=0;i<2*Rows-1;i++)
- {
- delete GeneData[i];
- delete GeneMask[i];
- GeneName[i] = "";
- UniqID[i] = "";
- }
- delete GeneData;
- delete GeneMask;
- delete Headers;
- delete IsData;
- delete InColumn;
- delete ArrayWeight;
- delete ArrayOrder;
- delete UniqID;
- delete GeneName;
- delete GeneOrder;
- delete GeneWeight;
- delete Use;
- // Load file into TStringList to parse it
- TStringList *List = new TStringList();
- try
- {
- List->LoadFromFile(FileName);
- // Extract file root
- fnsplit(FileName.c_str(),NULL,NULL,ClusterName,NULL);
- // Defaul JobName is root name of cluster file
- JobNameEdit->Text = ClusterName;
- int index;
- AnsiString Line, Field;
- int NameIndex, GeneWeightIndex, GeneOrderIndex;
- // This is list for individual lines of tab-delimited data
- TStringList *LineList = new TStringList();
- //First Parse Header Line
- //It is assumed that first line contains headers
- Headers = new TStringList();
- Line = List->Strings[0];
- while ((Field = NextString(&Line)) != "DONE")
- {
- Headers->Insert(0,Field);
- }
- //First Column is always the UniqueID
- UniqueID = Headers->Strings[0];
- // Look to see if NAME/DESCRIPTION, GWEIGHT/WEIGHT, or GORDER/ORDER
- // fields are present
- NameIndex = max(max(Headers->IndexOf("NAME"),Headers->IndexOf("DESC")),
- Headers->IndexOf("DESCRIPTION"));
- GeneWeightIndex = max(Headers->IndexOf("GWEIGHT"),Headers->IndexOf("WEIGHT"));
- GeneOrderIndex = max(Headers->IndexOf("GORDER"),Headers->IndexOf("ORDER"));
- IsData = new bool[Headers->Count];
- for (i=0;i<Headers->Count;i++)
- {
- IsData[i] = true;
- }
- Columns = Headers->Count; //Number of loaded columns
- // Will decrement columns and set IsData to false for non-data columns
- Columns--; //UniqueID Column always assumed
- IsData[0] = false;
- if (NameIndex > -1)
- {
- Columns--;
- IsData[NameIndex] = false;
- }
- if (GeneWeightIndex > -1)
- {
- Columns--;
- IsData[GeneWeightIndex] = false;
- }
- if (GeneOrderIndex > -1)
- {
- Columns--;
- IsData[GeneOrderIndex] = false;
- }
- index = 0; // Position in list of data columns
- InColumn = new int[Columns]; // Column in Input file of each data column
- for (i=0;i<Headers->Count;i++)
- {
- if (IsData[i] == true)
- {
- InColumn[index] = i;
- index++;
- }
- }
- // Array Weight is weight for each column in clustering
- // Default is 1.0
- ArrayWeight = new double[2*Columns-1]; // allow for possibility of
- ArrayOrder = new double[2*Columns-1]; // clustering arrays
- for (i=0;i<Columns;i++)
- {
- ArrayWeight[i] = 1;
- ArrayOrder[i] = 1;
- }
- // Now Figure out if there is an Array Weight ROW
- Rows = List->Count - 1;
- int LineOffset = 1;
- Line = List->Strings[1];
- LineList->Clear();
- while ((Field = NextString(&Line)) != "DONE")
- {
- LineList->Insert(0,Field);
- }
- if ( (LineList->Strings[0] == "EWEIGHT") ||
- (LineList->Strings[0] == "WEIGHT") )
- {
- Rows--;
- LineOffset++;
- index = 0;
- for (j=0;j<min(LineList->Count,Headers->Count);j++)
- {
- if (IsData[j] == true)
- {
- try
- {
- double Val = LineList->Strings[j].ToDouble();
- ArrayWeight[index] = Val;
- }
- catch (EConvertError &E)
- {
- ArrayWeight[index] = 0; // Default to zero if row is
- // present but cell empty
- }
- index ++;
- }
- }
- }
- Line = List->Strings[2];
- LineList->Clear();
- while ((Field = NextString(&Line)) != "DONE")
- {
- LineList->Insert(0,Field);
- }
- if ( (LineList->Strings[0] == "EORDER") ||
- (LineList->Strings[0] == "ORDER") )
- {
- Rows--;
- LineOffset ++;
- index = 0;
- for (j=0;j<min(LineList->Count,Headers->Count);j++)
- {
- if (IsData[j] == true)
- {
- try
- {
- double Val = LineList->Strings[j].ToDouble();
- ArrayOrder[index] = Val;
- }
- catch (EConvertError &E)
- {
- ArrayOrder[index] = 0; // Default to zero if row is
- // present but cell empty
- }
- index ++;
- }
- }
- }
- GeneData = new double*[2*Rows-1];
- GeneMask = new bool*[2*Rows-1];
- UniqID = new AnsiString[2*Rows-1];
- GeneName = new AnsiString[2*Rows-1];
- GeneWeight = new double[2*Rows-1];
- GeneOrder = new double[2*Rows-1];
- for (i=0;i<2*Rows-1;i++)
- {
- GeneData[i] = new double[Columns];
- GeneMask[i] = new bool[Columns];
- for (j=0;j<Columns;j++)
- {
- GeneMask[i][j] = false;
- }
- }
- for (i=LineOffset;i<List->Count;i++)
- {
- LineList->Clear();
- l = i - LineOffset;
- //Update user on status
- StatusBar1->SimpleText = "Loading Gene " + AnsiString(l + 1) + " of "
- + AnsiString(List->Count - LineOffset);
- Application->ProcessMessages();
- //Get current line
- Line = List->Strings[i];
- //Parse Line into fields
- while ((Field = NextString(&Line)) != "DONE")
- {
- LineList->Insert(0,Field);
- }
- //UniqID always columns 0
- UniqID[l] = LineList->Strings[0];
- if (NameIndex > -1)
- {
- GeneName[l] = LineList->Strings[NameIndex];
- }
- else
- {
- GeneName[l] = UniqID[l];
- }
- if (GeneWeightIndex > -1)
- {
- try
- {
- GeneWeight[l] = LineList->Strings[GeneWeightIndex].ToDouble();
- }
- catch (EConvertError &E)
- {
- }
- }
- else
- {
- GeneWeight[l] = 1.0;
- }
- if (GeneOrderIndex > -1)
- {
- try
- {
- GeneOrder[l] = LineList->Strings[GeneOrderIndex].ToDouble();
- }
- catch (EConvertError &E)
- {
- }
- }
- else
- {
- GeneOrder[l] = 1.0;
- }
- index = 0;
- for (j=0;j<min(LineList->Count,Headers->Count);j++)
- {
- if (IsData[j] == true)
- {
- if (LineList->Strings[j].Length() > 0)
- {
- try
- {
- double Val = LineList->Strings[j].ToDouble();
- GeneData[l][index] = Val;
- GeneMask[l][index] = true;
- }
- catch (EConvertError &E)
- {
- }
- }
- index ++;
- }
- }
- }
- delete LineList;
- StatusBar1->SimpleText = "Done Loading Data";
- }
- catch (Exception &E)
- {
- Application->MessageBox("Could not Open FilenIf the file is open in another program (e.g. Excel)nClose it and try again","Could Not Open File",
- MB_OK);
- }
- delete List;
- }
- /* Tab-delimited parsing utility for LoadFile */
- AnsiString TMainForm::NextString(AnsiString* String)
- {
- AnsiString Field;
- int delim = (*String).LastDelimiter("t");
- if (delim != 0)
- {
- Field = (*String).SubString(delim+1,(*String).Length()-delim);
- (*String) = (*String).SubString(0,delim-1);
- }
- else
- {
- Field = (*String);
- (*String) = (*String).SubString(0,0);
- }
- if ((Field.Length() == 0) && (delim == 0) )
- {
- return AnsiString("DONE");
- }
- else
- {
- return Field;
- }
- }
- void __fastcall TMainForm::SaveData(AnsiString FileName)
- {
- TStringList *DataFile = new TStringList();
- AnsiString OutString = "";
- OutString += Headers->Strings[0] + AnsiString("t");
- OutString += AnsiString("NAME") + AnsiString("t");
- OutString += "GWEIGHT";
- TFloatFormat Format = ffGeneral;
- // Now add headers for data columns
- for (int i=0;i<Columns;i++)
- {
- OutString += "t";
- OutString += AnsiString(Headers->Strings[InColumn[i]]);
- }
- DataFile->Add(OutString);
- {
- OutString = AnsiString("EWEIGHT");
- OutString += AnsiString("t");
- OutString += AnsiString("t");
- for (int i=0;i<Columns;i++)
- {
- OutString += "t" + AnsiString(ArrayWeight[i]);
- }
- }
- DataFile->Add(OutString);
- int index, colindex;
- for (int i=0;i<Rows;i++)
- {
- index = i;
- OutString = "";
- OutString += AnsiString(UniqID[index]) + "t" + AnsiString(GeneName[index]);
- OutString += "t" + AnsiString(GeneWeight[index]);
- for (int j=0;j<Columns;j++)
- {
- colindex = j;
- if (GeneMask[index][colindex] == true)
- {
- OutString += "t" + AnsiString(FloatToStrF(GeneData[index][colindex],Format,4,2));
- }
- else
- {
- OutString += "t";
- }
- }
- DataFile->Add(OutString);
- }
- DataFile->SaveToFile(FileName);
- delete DataFile;
- }
- void __fastcall TMainForm::SaveButtonClick(TObject *Sender)
- {
- SaveDataDialog->FileName = JobNameEdit->Text + ".txt";
- if (SaveDataDialog->Execute())
- {
- SaveData(SaveDataDialog->FileName);
- }
- }
- //---------------------------------------------------------------------------
- /* File Format Help Button Clicked */
- void __fastcall TMainForm::FileHelpClick(TObject *Sender)
- {
- FileFormatForm->Show();
- }
- /* Open Cluster Manual in Browser Window */
- void __fastcall TMainForm::ManualButtonClick(TObject *Sender)
- {
- ShellExecute(Handle, "open", "http://rana.stanford.edu/software/manuals/ClusterTreeView.pdf", 0, 0, SW_SHOWNORMAL);
- }
- //---------------------------------------------------------------------------
- /* Filter data. Apply user selected criteria to flag (for subsequent removal)
- rows that fail to pass tests. Note that filters are assessed here and applied
- separately so the user can adjust parameters to get appropriate number
- of rows passing */
- void __fastcall TMainForm::FilterClick(TObject *Sender)
- {
- FilterResultsLabel->Visible = false;
- AcceptFilterButton->Visible = false;
- /* Store results in boolean Use */
- delete Use;
- Use = new bool[Rows];
- UseRows = 0;
- for (int Row=0;Row<Rows;Row++)
- {
- StatusBar1->SimpleText = "Assessing Filters for Gene " + AnsiString(Row);
- int Count = 0;
- int CountAbs = 0;
- double Sum = 0;
- double Sum2 = 0;
- double Min = 10000000;
- double Max = -10000000;
- /* Compute some row stats */
- for (int Column=0;Column<Columns;Column++)
- {
- if (GeneMask[Row][Column])
- {
- Sum += GeneData[Row][Column];
- Sum2 += pow(GeneData[Row][Column],2);
- Count ++;
- Min = min(GeneData[Row][Column],Min);
- Max = max(GeneData[Row][Column],Max);
- if (fabs(GeneData[Row][Column]) >= FilterAbsValVal)
- {
- CountAbs++;
- }
- }
- }
- Use[Row] = true;
- /* Filter based on percent values present; remove rows
- with too many missing values */
- if (FilterPercentCheckBox->Checked)
- {
- double PercentPresent = (double) Count / (double) Columns;
- if ( (100 * PercentPresent) < FilterPercentVal )
- {
- Use[Row] = false;
- }
- }
- /* Remove rows with low SD */
- if (FilterSDCheckBox->Checked)
- {
- if (Count > 1)
- {
- double Ave = Sum / (double) Count;
- double Arg = (Sum2 - 2 * Ave * Sum + (double) Count * pow(Ave,2))/ ((double)(Count-1));
- double SD = sqrt(Arg);
- if (SD < FilterSDVal)
- {
- Use[Row] = false;
- }
- }
- else
- {
- Use[Row] = false;
- }
- }
- /* Remove rows with too few extreme values */
- if (FilterAbsValCheckBox->Checked)
- {
- if (CountAbs < FilterAbsValCount)
- {
- Use[Row] = false;
- }
- }
- /* Remove rows with too small Max-Min */
- if (FilterMaxMinCheckBox->Checked)
- {
- if (Max - Min < FilterMaxMinVal)
- {
- Use[Row] = false;
- }
- }
- if (Use[Row])
- {
- UseRows++;
- }
- }
- FilterResultsLabel->Visible = true;
- /* Tell user how many rows passed */
- FilterResultsLabel->Caption = AnsiString(UseRows) + " passed out of " +
- AnsiString(Rows);
- AcceptFilterButton->Visible = true;
- StatusBar1->SimpleText = "Done Analyzing Filters";
- }
- //---------------------------------------------------------------------------
- /* Accept results of last filtering */
- void __fastcall TMainForm::AcceptFilterButtonClick(TObject *Sender)
- {
- AcceptFilterButton->Visible = false;
- AnsiString *TempID;
- AnsiString *TempName;
- double *TempOrder;
- double *TempWeight;
- double **TempData;
- bool **TempMask;
- TempID = new AnsiString[Rows];
- TempName = new AnsiString[Rows];
- TempOrder = new double[Rows];
- TempWeight = new double[Rows];
- TempData = new double*[Rows];
- TempMask = new bool*[Rows];
- for (int Row=0;Row<Rows;Row++)
- {
- TempData[Row] = new double[Columns];
- TempMask[Row] = new bool[Columns];
- for (int Column=0;Column<Columns;Column++)
- {
- TempData[Row][Column] = GeneData[Row][Column];
- TempMask[Row][Column] = GeneMask[Row][Column];
- }
- TempID[Row] = UniqID[Row];
- TempName[Row] = GeneName[Row];
- TempOrder[Row] = GeneOrder[Row];
- TempWeight[Row] = GeneWeight[Row];
- }
- for (int Row=0;Row<Rows;Row++)
- {
- delete GeneData[Row];
- delete GeneMask[Row];
- UniqID[Row] = "";
- GeneName[Row] = "";
- }
- delete GeneData;
- delete GeneMask;
- delete UniqID;
- delete GeneName;
- delete GeneOrder;
- delete GeneWeight;
- UniqID = new AnsiString[2*UseRows-1];
- GeneName = new AnsiString[2*UseRows-1];
- GeneOrder = new double[2*UseRows-1];
- GeneWeight = new double[2*UseRows-1];
- GeneData = new double*[2*UseRows-1];
- GeneMask = new bool*[2*UseRows-1];
- for (int Row=0;Row<2*UseRows-1;Row++)
- {
- GeneData[Row] = new double[Columns];
- GeneMask[Row] = new bool[Columns];
- }
- int Counter = 0;
- for (int Row=0;Row<Rows;Row++)
- {
- if (Use[Row])
- {
- for (int Column=0;Column<Columns;Column++)
- {
- GeneData[Counter][Column] = TempData[Row][Column];
- GeneMask[Counter][Column] = TempMask[Row][Column];
- }
- UniqID[Counter] = TempID[Row];
- GeneName[Counter] = TempName[Row];
- GeneOrder[Counter] = TempOrder[Row];
- GeneWeight[Counter] = TempWeight[Row];
- Counter++;
- }
- }
- for (int Row=0;Row<Rows;Row++)
- {
- delete TempData[Row];
- delete TempMask[Row];
- TempName[Row] = "";
- TempID[Row] = "";
- }
- delete TempData;
- delete TempMask;
- delete TempID;
- delete TempName;
- delete TempOrder;
- delete TempWeight;
- Rows = Counter;
- /* Update SOM defaults to reflect new number of rows */
- SOMGenesYDim = 1 + sqrt((double)Rows);
- SOMGenesYDimEdit->Text = SOMGenesYDim;
- SOMArraysYDim = 1 + sqrt((double)Columns);
- SOMArraysYDimEdit->Text = SOMArraysYDim;
- }
- //---------------------------------------------------------------------------
- //---------------------------------------------------------------------------
- void __fastcall TMainForm::FilterPercentEditExit(TObject *Sender)
- {
- double TempVal = FilterPercentVal;
- try
- {
- FilterPercentVal = FilterPercentEdit->Text.ToDouble();
- }
- catch (EConvertError &E)
- {
- FilterPercentVal = TempVal;
- FilterPercentEdit->Text = FilterPercentVal;
- }
- }
- //---------------------------------------------------------------------------
- void __fastcall TMainForm::FilterSDEditExit(TObject *Sender)
- {
- double TempVal = FilterSDVal;
- try
- {
- FilterSDVal = FilterSDEdit->Text.ToDouble();
- }
- catch (EConvertError &E)
- {
- FilterSDVal = TempVal;
- FilterSDEdit->Text = FilterSDVal;
- }
- }
- //---------------------------------------------------------------------------
- void __fastcall TMainForm::FilterAbsValCountEditExit(TObject *Sender)
- {
- int TempVal = FilterAbsValCount;
- try
- {
- FilterAbsValCount = FilterAbsValCountEdit->Text.ToDouble();
- }
- catch (EConvertError &E)
- {
- FilterAbsValCount = TempVal;
- FilterAbsValCountEdit->Text = FilterAbsValCount;
- }
- }
- //---------------------------------------------------------------------------
- void __fastcall TMainForm::FilterAbsValEditExit(TObject *Sender)
- {
- double TempVal = FilterAbsValVal;
- try
- {
- FilterAbsValVal = FilterAbsValEdit->Text.ToDouble();
- }
- catch (EConvertError &E)
- {
- FilterAbsValVal = TempVal;
- FilterAbsValEdit->Text = FilterAbsValVal;
- }
- }
- //---------------------------------------------------------------------------
- void __fastcall TMainForm::FilterMaxMinEditExit(TObject *Sender)
- {
- double TempVal = FilterMaxMinVal;
- try
- {
- FilterMaxMinVal = FilterMaxMinEdit->Text.ToDouble();
- }
- catch (EConvertError &E)
- {
- FilterMaxMinVal = TempVal;
- FilterMaxMinEdit->Text = FilterMaxMinVal;
- }
- }
- //---------------------------------------------------------------------------
- /* Adjust data values in various ways */
- void __fastcall TMainForm::AdjustDataButtonClick(TObject *Sender)
- {
- for (int Row=0; Row<Rows; Row++)
- {
- StatusBar1->SimpleText = "Adjusting Data for Gene " + AnsiString(Row+1)
- + " of " + AnsiString(Rows);
- float *DataPoints = vector(1,Columns);
- int CountDataPoints = 0;
- float DataSum = 0;
- for (int Column=0;Column<Columns;Column++)
- {
- if ( LogTransformCheckBox->Checked && GeneMask[Row][Column] )
- {
- if (GeneData[Row][Column] > 0)
- {
- GeneData[Row][Column] = log(GeneData[Row][Column])/log(2);
- }
- else
- {
- GeneMask[Row][Column] = 0;
- }
- }
- if (GeneMask[Row][Column])
- {
- CountDataPoints++;
- DataPoints[CountDataPoints] = GeneData[Row][Column];
- DataSum += GeneData[Row][Column];
- }
- }
- if (MeanCenterGenesCheckBox->Checked)
- {
- if (CountDataPoints > 0)
- {
- float DataMean = DataSum / (float) CountDataPoints;
- for (int Column=0;Column<Columns;Column++)
- {
- GeneData[Row][Column] -= DataMean;
- }
- }
- }
- else if (MedianCenterGenesCheckBox->Checked)
- {
- float Median;
- if (CountDataPoints > 1)
- {
- if ( (CountDataPoints % 2) == 0)
- {
- Median = 0.5 * ( select((CountDataPoints/2),CountDataPoints,DataPoints)
- + select(1+(CountDataPoints/2),CountDataPoints,DataPoints));
- }
- else
- {
- Median = select(((CountDataPoints+1)/2),CountDataPoints,DataPoints);
- }
- }
- else if (CountDataPoints == 1)
- {
- Median = DataPoints[0];
- }
- for (int Column=0;Column<Columns;Column++)
- {
- if (GeneMask[Row][Column])
- {
- GeneData[Row][Column] -= Median;
- }
- }
- }
- if (NormalizeGenesCheckBox->Checked)
- {
- float DataSum2 = 0;
- for (int Column=0;Column<Columns;Column++)
- {
- if (GeneMask[Row][Column])
- {
- DataSum2 += pow(GeneData[Row][Column],2);
- }
- }
- float DataMag = sqrt(DataSum2);
- if (DataMag > 0)
- {
- for (int Column=0;Column<Columns;Column++)
- {
- if (GeneMask[Row][Column])
- {
- GeneData[Row][Column] /= DataMag;
- }
- }
- }
- }
- free_vector(DataPoints,1,Columns);
- }
- for (int Column=0; Column<Columns; Column++)
- {
- StatusBar1->SimpleText = "Adjusting Data for Sample " + AnsiString(Column+1)
- + " of " + AnsiString(Columns);
- float *DataPoints = vector(1,Rows);
- int CountDataPoints = 0;
- float DataSum = 0;
- for (int Row=0;Row<Rows;Row++)
- {
- if (GeneMask[Row][Column])
- {
- CountDataPoints++;
- DataPoints[CountDataPoints] = GeneData[Row][Column];
- DataSum += GeneData[Row][Column];
- }
- }
- if (MeanCenterArraysCheckBox->Checked)
- {
- if (CountDataPoints > 0)
- {
- float DataMean = DataSum / (float) CountDataPoints;
- for (int Row=0;Row<Rows;Row++)
- {
- GeneData[Row][Column] -= DataMean;
- }
- }
- }
- else if (MedianCenterArraysCheckBox->Checked)
- {
- float Median;
- if (CountDataPoints > 1)
- {
- if ( (CountDataPoints % 2) == 0)
- {
- Median = 0.5 * ( select((CountDataPoints/2),CountDataPoints,DataPoints)
- + select(1+(CountDataPoints/2),CountDataPoints,DataPoints));
- }
- else
- {
- Median = select(((CountDataPoints+1)/2),CountDataPoints,DataPoints);
- }
- }
- else if (CountDataPoints == 1)
- {
- Median = DataPoints[0];
- }
- for (int Row=0;Row<Rows;Row++)
- {
- if (GeneMask[Row][Column])
- {
- GeneData[Row][Column] -= Median;
- }
- }
- }
- if (NormalizeArraysCheckBox->Checked)
- {
- float DataSum2 = 0;
- for (int Row=0;Row<Rows;Row++)
- {
- if (GeneMask[Row][Column])
- {
- DataSum2 += pow(GeneData[Row][Column],2);
- }
- }
- float DataMag = sqrt(DataSum2);
- if (DataMag > 0)
- {
- for (int Row=0;Row<Rows;Row++)
- {
- if (GeneMask[Row][Column])
- {
- GeneData[Row][Column] /= DataMag;
- }
- }
- }
- }
- free_vector(DataPoints,1,Rows);
- }
- StatusBar1->SimpleText = "Done Adjusting Data";
- }
- /* Distance Functions for Clustering */
- /* Old Function that returns only Pearson Correlation */
- unsigned short TMainForm::Distance(double **Data, bool **Mask, double *Weight,
- bool Centered, bool Absolute, int elem1, int elem2, int DataColumns)
- {
- return Correlation(Data,Mask,Weight,Centered,Absolute,elem1,elem2,DataColumns);
- }
- unsigned short TMainForm::Distance(int Metric, double **Data, bool **Mask, double *Weight,
- int elem1, int elem2, int DataColumns)
- {
- int i;
- int k;
- float *Vector1;
- float *Vector2;
- float d,zd,probd,rs,probrs;
- float tau, tauz, taup;
- switch (Metric)
- {
- /* Uncentered correlation */
- case 0:
- default:
- return Correlation(Data,Mask,Weight,false,false,elem1,elem2,DataColumns);
- break;
- case 2:
- return Correlation(Data,Mask,Weight,false,true,elem1,elem2,DataColumns);
- break;
- /* Centered correlation */
- case 1:
- return Correlation(Data,Mask,Weight,true,false,elem1,elem2,DataColumns);
- break;
- case 3:
- return Correlation(Data,Mask,Weight,true,true,elem1,elem2,DataColumns);
- break;
- /* Euclidean distance */
- //case 4:
- /* Spearman Rank */
- case 4:
- rs = 0;
- #ifdef NUMREC
- Vector1 = vector(1,DataColumns);
- Vector2 = vector(1,DataColumns);
- k=0;
- for (i=0; i<DataColumns; i++)
- {
- if ( (Mask[elem1][i] == true) && (Mask[elem2][i] == true) )
- {
- k++;
- Vector1[k] = Data[elem1][i];
- Vector2[k] = Data[elem2][i];
- }
- }
- if (k > 1)
- {
- spear(Vector1,Vector2,k,&d,&zd,&probd,&rs,&probrs);
- }
- else
- {
- rs = 0;
- }
- free_vector(Vector1,1,DataColumns);
- free_vector(Vector2,1,DataColumns);
- #endif
- return (unsigned short) (16384.0 * (1.0 - rs));
- break;
- /* Kendall's Tau */
- case 5:
- k = 0;
- tau = 0;
- #ifdef NUMREC
- Vector1 = vector(1,DataColumns);
- Vector2 = vector(1,DataColumns);
- for (i=0; i<DataColumns; i++)
- {
- if ( (Mask[elem1][i] == true) && (Mask[elem2][i] == true) )
- {
- k++;
- Vector1[k] = Data[elem1][i];
- Vector2[k] = Data[elem2][i];
- }
- }
- if (k > 1)
- {
- kendl1(Vector1, Vector2, k, &tau, &tauz, &taup);
- }
- else
- {
- tau = 0;
- }
- free_vector(Vector1,1,DataColumns);
- free_vector(Vector2,1,DataColumns);
- #endif
- return (unsigned short) (16384.0 * (1.0 - tau));
- break;
- }
- }
- /* Pearson Correlation */
- unsigned short TMainForm::Correlation(double **Data, bool **Mask, double *Weight,
- bool Centered, bool Absolute, int elem1, int elem2, int DataColumns)
- {
- int i;
- double Sum1, Sum2;
- double Sum11, Sum22;
- double Sum12;
- double Ave1, Ave2;
- double Norm;
- double Corr = -1.0;
- double Count = 0;
- Sum1 = 0;
- Sum2 = 0;
- Sum11 = 0;
- Sum22 = 0;
- Sum12 = 0;
- for (i=0;i<DataColumns;i++)
- {
- try
- {
- if ( (Mask[elem1][i] == true) && (Mask[elem2][i] == true) )
- {
- Sum1 += Weight[i]*Data[elem1][i];
- Sum2 += Weight[i]*Data[elem2][i];
- Sum11 += Weight[i]*Data[elem1][i] * Data[elem1][i];
- Sum22 += Weight[i]*Data[elem2][i] * Data[elem2][i];
- Sum12 += Weight[i]*Data[elem1][i] * Data[elem2][i];
- Count += Weight[i];
- }
- }
- catch (Exception &E)
- {
- Sum1 = 0;
- }
- }
- if (Count > 0)
- {
- if (Centered)
- {
- Ave1 = Sum1/Count;
- Ave2 = Sum2/Count;
- }
- else
- {
- Ave1 = 0;
- Ave2 = 0;
- }
- try
- {
- Norm = sqrt(
- max(0.0,Sum11 - Count * Ave1 * Ave1)
- *
- max(0.0,Sum22 - Count * Ave2 * Ave2)
- );
- }
- catch (Exception &E)
- {
- }
- if ( (Norm > 0) )
- {
- Corr = (Sum12 - Count * Ave1 * Ave2)
- / (Norm);
- }
- if (Absolute == true)
- {
- return (unsigned short) (32768.0 * (1.0 - fabs(Corr)));
- }
- else
- {
- return (unsigned short) (16384.0 * (1.0 - Corr));
- }
- }
- else
- {
- return 0;
- }
- }
- /* Pearson Correlation without Centered and Absolute options */
- float TMainForm::Correlation(double **Data, bool **Mask, double *Weight,
- int elem1, int elem2, int DataColumns)
- {
- int i;
- bool Centered = false;
- //bool Absolute = false;
- double Sum1, Sum2;
- double Sum11, Sum22;
- double Sum12;
- double Ave1, Ave2;
- double Norm1, Norm2;
- float Corr = -1.0;
- double Count = 0;
- Sum1 = 0;
- Sum2 = 0;
- Sum11 = 0;
- Sum22 = 0;
- Sum12 = 0;
- for (i=0;i<DataColumns;i++)
- {
- try
- {
- if ( (Mask[elem1][i] == true) && (Mask[elem2][i] == true) )
- {
- Sum1 += Weight[i]*Data[elem1][i];
- Sum2 += Weight[i]*Data[elem2][i];
- Sum11 += Weight[i]*Data[elem1][i] * Data[elem1][i];
- Sum22 += Weight[i]*Data[elem2][i] * Data[elem2][i];
- Sum12 += Weight[i]*Data[elem1][i] * Data[elem2][i];
- Count += Weight[i];
- }
- }
- catch (Exception &E)
- {
- Sum1 = 0;
- }
- }
- if (Count > 0)
- {
- if (Centered)
- {
- Ave1 = Sum1/Count;
- Ave2 = Sum2/Count;
- }
- else
- {
- Ave1 = 0;
- Ave2 = 0;
- }
- try
- {
- Norm1 = sqrt(max(0.0,Sum11 - 2 * Ave1 * Sum1 + Count * Ave1 * Ave1));
- Norm2 = sqrt(max(0.0,Sum22 - 2 * Ave2 * Sum2 + Count * Ave2 * Ave2));
- }
- catch (Exception &E)
- {
- }
- if ( (Norm1 > 0) && (Norm2 > 0) )
- {
- Corr = (Sum12 - Sum1 * Ave2 - Sum2 * Ave1 + Count * Ave1 * Ave2)
- / (Norm1 * Norm2);
- }
- return Corr;
- }
- else
- {
- return 0;
- }
- }
- /* Button Pressed for Average Linkage Hierarchical Clustering: Setup and Execute */
-
void __fastcall TMainForm::AverageLinkageClusterButtonClick(TObject *Sender)
- {
- int i,j;
- StatusBar1->SimpleText = "Initializing";
- // Get ClusterID for file names from user input
- JobName = AnsiString(ClusterName);
- if (JobNameEdit->Text.Length() > 0)
- {
- JobName = JobNameEdit->Text;
- }
- bool ClusterGenes = ClusterGenesCheckBox->Checked;
- bool ClusterArrays = ClusterArraysCheckBox->Checked;
- bool CalculateGeneWeights = CalculateGeneWeightsCheckBox->Checked;
- bool CalculateArrayWeights = CalculateArrayWeightsCheckBox->Checked;
- int *ColumnOrder = new int[Columns];
- AnsiString *ArrayID = new AnsiString[2*Columns-1];
- for (i=0;i<Columns;i++)
- {
- ColumnOrder[i] = i;
- ArrayID[i] = "ARRY" + AnsiString(i) + "X";
- }
- if (CalculateGeneWeights == true)
- {
- // Generate array data structures
- // This is a wasteful way of doing it, but makes
- // the coding easier for me
- double **ArrayData = new double*[2*Columns-1];
- bool **ArrayMask = new bool*[2*Columns-1];
- for (i=0;i<2*Columns-1;i++)
- {
- ArrayData[i] = new double[Rows];
- ArrayMask[i] = new bool[Rows];
- if (i<Columns)
- {
- for (j=0;j<Rows;j++)
- {
- ArrayData[i][j] = GeneData[j][i];
- ArrayMask[i][j] = GeneMask[j][i];
- }
- }
- }
- TStringList *ArrayTreeFile = new TStringList();
- TStringList *ArrayClusterOrder = new TStringList();
- Cluster(ArrayData,ArrayMask,Columns,Rows,
- ArrayTreeFile,ArrayOrder,ArrayWeight,GeneWeight,ArrayID,
- ArrayMetricComboBox->ItemIndex,
- true,ArrayWeightCutoff,ArrayWeightExp,
- ArrayClusterOrder,StatusBar1,false,NULL);
- delete ArrayClusterOrder;
- for (i=0;i<2*Columns-1;i++)
- {
- delete ArrayData[i];
- delete ArrayMask[i];
- }
- delete ArrayData;
- delete ArrayMask;
- }
- int *RowOrder = new int[Rows];
- AnsiString *GeneID = new AnsiString[2*Rows-1];
- for (i=0;i<Rows;i++)
- {
- RowOrder[i] = i;
- GeneID[i] = AnsiString("GENE") + AnsiString(i) + AnsiString("X");
- }
- if (ClusterGenes == true)
- {
- TStringList *GeneTreeFile = new TStringList();
- TStringList *GeneClusterOrder = new TStringList();
- if (CalculateArrayWeights == true)
- {
- Cluster(GeneData,GeneMask,Rows,Columns,
- GeneTreeFile,GeneOrder,GeneWeight, ArrayWeight,GeneID,
- GeneMetricComboBox->ItemIndex,
- true,GeneWeightCutoff,GeneWeightExp,
- GeneClusterOrder,StatusBar1,false,NULL);
- }
- else
- {
- double *NullGeneWeight = new double[Rows];
- Cluster(GeneData,GeneMask,Rows,Columns,
- GeneTreeFile,GeneOrder,NullGeneWeight,ArrayWeight,GeneID,
- GeneMetricComboBox->ItemIndex,
- false,1.0,1.0,
- GeneClusterOrder,StatusBar1,false,NULL);
- delete NullGeneWeight;
- }
- for (i=0;i<GeneClusterOrder->Count;i++)
- {
- RowOrder[i] = GeneClusterOrder->Strings[i].ToInt();
- }
- AnsiString TreeFileName = JobName + ".gtr";
- GeneTreeFile->SaveToFile(TreeFileName);
- }
- if (ClusterArrays == true)
- {
- // Generate array data structures
- // This is a wasteful way of doing it, but makes
- // the coding easier for me
- double **ArrayData = new double*[2*Columns-1];
- bool **ArrayMask = new bool*[2*Columns-1];
- for (i=0;i<2*Columns-1;i++)
- {
- ArrayData[i] = new double[Rows];
- ArrayMask[i] = new bool[Rows];
- if (i<Columns)
- {
- for (j=0;j<Rows;j++)
- {
- ArrayData[i][j] = GeneData[j][i];
- ArrayMask[i][j] = GeneMask[j][i];
- }
- }
- }
- TStringList *ArrayTreeFile = new TStringList();
- TStringList *ArrayClusterOrder = new TStringList();
- Cluster(ArrayData,ArrayMask,Columns,Rows,
- ArrayTreeFile,ArrayOrder,ArrayWeight,GeneWeight,ArrayID,
- ArrayMetricComboBox->ItemIndex,
- false,1.0,1.0,ArrayClusterOrder,StatusBar1,false,NULL);
- for (i=0;i<ArrayClusterOrder->Count;i++)
- {
- ColumnOrder[i] = ArrayClusterOrder->Strings[i].ToInt();
- }
- delete ArrayClusterOrder;
- for (i=0;i<2*Columns-1;i++)
- {
- delete ArrayData[i];
- delete ArrayMask[i];
- }
- delete ArrayData;
- delete ArrayMask;
- AnsiString ArrayTreeFileName = JobName + ".atr";
- ArrayTreeFile->SaveToFile(ArrayTreeFileName);
- delete ArrayTreeFile;
- }
- AnsiString OutString;
- TStringList *DataFile = new TStringList();
- // Now make output .cdt file
- OutString = "";
- if (ClusterGenes == true)
- {
- OutString += AnsiString("GID") + AnsiString("t");
- }
- OutString += Headers->Strings[0] + AnsiString("t");
- OutString += AnsiString("NAME") + AnsiString("t");
- OutString += "GWEIGHT";
- // Now add headers for data columns
- for (i=0;i<Columns;i++)
- {
- OutString += "t";
- OutString += AnsiString(Headers->Strings[InColumn[ColumnOrder[i]]]);
- }
- DataFile->Add(OutString);
- if (ClusterArrays == true)
- {
- OutString = AnsiString("AID");
- if (ClusterGenes == true)
- {
- OutString += AnsiString("t");
- }
- OutString += AnsiString("t");
- OutString += AnsiString("t");
- for (i=0;i<Columns;i++)
- {
- OutString += "t";
- OutString += ArrayID[ColumnOrder[i]];
- }
- DataFile->Add(OutString);
- }
- {
- OutString = AnsiString("EWEIGHT");
- if (ClusterGenes == true)
- {
- OutString += AnsiString("t");
- }
- OutString += AnsiString("t");
- OutString += AnsiString("t");
- for (i=0;i<Columns;i++)
- {
- OutString += "t";
- OutString += ArrayWeight[ColumnOrder[i]];
- }
- }
- DataFile->Add(OutString);
- int index;
- TFloatFormat Format = ffGeneral;
- for (i=0;i<Rows;i++)
- {
- index = RowOrder[i];
- OutString = "";
- if (ClusterGenes == true)
- {
- OutString += GeneID[index] + "t";
- }
- OutString += AnsiString(UniqID[index]) + "t" + AnsiString(GeneName[index]);
- OutString += "t" + AnsiString(GeneWeight[index]);
- for (j=0;j<Columns;j++)
- {
- if (GeneMask[index][ColumnOrder[j]] == true)
- {
- OutString += "t" + AnsiString(FloatToStrF(GeneData[index][ColumnOrder[j]],Format,4,2));
- }
- else
- {
- OutString += "t";
- }
- }
- DataFile->Add(OutString);
- }
- AnsiString DataFileName = JobName + ".cdt";
- DataFile->SaveToFile(DataFileName);
- for (i=0;i<2*Columns-1;i++)
- {
- ArrayID[i] = "";
- }
- for (i=0;i<2*Rows-1;i++)
- {
- GeneID[i] = "";
- }
- delete ArrayID;
- delete GeneID;
- delete ColumnOrder;
- delete RowOrder;
- StatusBar1->SimpleText = "Done Clustering";
- }
- //---------------------------------------------------------------------------
- /* Average Linkage Hierarchical Clustering Code */
- void TMainForm::Cluster(double **ClusterData, bool **ClusterMask,
- int ClusterRows, int ClusterColumns,
- TStringList *TreeFile, double *Order, double *RowWeight, double *ColumnWeight,
- AnsiString *ID, int DistFunction,
- bool CalculateWeights, double WeightCutoff, double WeightPower,
- TStringList *ClusterOrder, TStatusBar *StatusBar,
- bool ReturnTNode, TNode *TopNode)
- {
- int i,j,k,l,m;
- unsigned short *minval;
- int *minpair;
- int *elements;
- AnsiString OutString;
- unsigned short **Dist;
- int **NodeElement;
- double *NodeDistance;
- double *CalcWeight;
- int element1, element2;
- Dist = new unsigned short*[2*ClusterRows-1];
- minval = new unsigned short[2*ClusterRows-1];
- minpair = new int[2*ClusterRows-1];
- elements = new int[2*ClusterRows-1];
- NodeElement = new int*[2*ClusterRows-1];
- NodeDistance = new double[2*ClusterRows-1];
- CalcWeight = new double[2*ClusterRows-1];
- bool SaveDistances = false;
- double WeightDist;
- for (i=0;i<ClusterRows;i++)
- {
- Dist[i] = new unsigned short[2*ClusterRows-1];
- }
- for (i=0;i<2*ClusterRows-1;i++)
- {
- minval[i] = 32769;
- elements[i] = 1;
- NodeElement[i] = new int[2];
- CalcWeight[i] = 1;
- NodeDistance[i] = 0.0;
- }
- bool Centered = false;
- if ((DistFunction == 1) || (DistFunction == 3) )
- {
- Centered = true;
- }
- bool Absolute = false;
- if ((DistFunction == 3) || (DistFunction == 4) )
- {
- Absolute = true;
- }
- bool CalcInferredDistances = false;
- TStringList **MemberList;
- unsigned short **TreeDist;
- if (CalcInferredDistances == true)
- {
- MemberList = new TStringList*[2*ClusterRows-1];
- TreeDist = new unsigned short*[ClusterRows];
- for (i=0;i<2*ClusterRows-1;i++)
- {
- MemberList[i] = new TStringList();
- if (i<ClusterRows)
- {
- MemberList[i]->Add(AnsiString(i));
- TreeDist[i] = new unsigned short[ClusterRows];
- }
- }
- }
- int TotalDistance = ClusterRows * (ClusterRows - 1) / 2;
- /* First step is to compute the distance matrix
- This is the slowest step */
- for (i=0;i<ClusterRows;i++)
- {
- Dist[i][i] = 0.0;
- for (j=0;j<i;j++)
- {
- Dist[i][j] = Distance(DistFunction,ClusterData,ClusterMask,ColumnWeight,
- i,j,ClusterColumns);
- Dist[j][i] = Dist[i][j];
- if (CalculateWeights == true)
- {
- WeightDist = ( (double) (32768 - Dist[i][j])) /32768.0;
- WeightDist = max(0.0, (WeightDist - WeightCutoff)/(1.0 - WeightCutoff));
- WeightDist = pow(WeightDist,WeightPower);
- CalcWeight[i] += WeightDist;
- CalcWeight[j] += WeightDist;
- }
- if (Dist[i][j] < minval[i])
- {
- minval[i] = Dist[i][j];
- minpair[i] = j;
- }
- if (Dist[i][j] < minval[j])
- {
- minval[j] = Dist[i][j];
- minpair[j] = i;
- }
- }
- StatusBar->SimpleText = "Calculating Distances " + AnsiString(i*(i+1)/2) + " of "
- + AnsiString(TotalDistance);
- Application->ProcessMessages();
- }
- TStringList *MinList = new TStringList();
- for (i=0;i<ClusterRows;i++)
- {
- MinList->Add(AnsiString(minval[i]));
- }
- MinList->SaveToFile("minlist.txt");
- delete MinList;
- bool *Active;
- Active = new bool[2*ClusterRows-1];
- for (i=0;i<ClusterRows;i++)
- {
- Active[i] = true;
- }
- for (i=ClusterRows;i<2*ClusterRows-1;i++)
- {
- Active[i] = false;
- }
- unsigned short minminval;
- int min1, min2;
- int nodeindex;
- //int node;
- /* Now we join nodes */
- /* If we are going to return TNode, need to set up TNodes */
- TNode **Nodes;
- if (ReturnTNode)
- {
- Nodes = new TNode*[2*ClusterRows-1];
- for (i=0;i<2*ClusterRows-1;i++)
- {
- Nodes[i] = new TNode();
- if (i < ClusterRows)
- {
- Nodes[i]->IsNode = false;
- Nodes[i]->ID = AnsiString(i);
- }
- else
- {
- Nodes[i]->IsNode = true;
- }
- }
- }
- for (nodeindex=ClusterRows; nodeindex<2*ClusterRows-1;nodeindex++)
- {
- ID[nodeindex] = "NODE" + AnsiString(nodeindex-ClusterRows+1) + "X";
- minminval = 32769;
- for (i=0;i<2*ClusterRows-1;i++)
- {
- if (Active[i] == true)
- {
- if (minval[i] < minminval)
- {
- minminval = minval[i];
- min1 = i;
- min2 = minpair[i];
- }
- }
- }
- /* When we get here min1 and min2 are the elements that are to be joined */
- Active[min1] = false;
- Active[min2] = false;
- if (Order[min1] > Order[min2])
- {
- NodeElement[nodeindex][0] = min1;
- NodeElement[nodeindex][1] = min2;
- }
- else
- {
- NodeElement[nodeindex][1] = min1;
- NodeElement[nodeindex][0] = min2;
- }
- NodeDistance[nodeindex] = ((double) minminval)/16384.0;
- NodeDistance[nodeindex] = max(NodeDistance[nodeindex],NodeDistance[min1]);
- NodeDistance[nodeindex] = max(NodeDistance[nodeindex],NodeDistance[min2]);
- if (ReturnTNode)
- {
- Nodes[nodeindex]->Child1 = Nodes[NodeElement[nodeindex][0]];
- Nodes[nodeindex]->Child2 = Nodes[NodeElement[nodeindex][1]];
- Nodes[nodeindex]->Length = 1.0 - NodeDistance[nodeindex];
- }
- if (CalcInferredDistances == true)
- {
- int elem1,elem2,ielem1,ielem2;
- for (elem1=0;elem1<MemberList[min1]->Count;elem1++)
- {
- for (elem2=0;elem2<MemberList[min2]->Count;elem2++)
- {
- ielem1 = (MemberList[min1]->Strings[elem1]).ToInt();
- ielem2 = (MemberList[min2]->Strings[elem2]).ToInt();
- TreeDist[ielem1][ielem2] = (NodeDistance[nodeindex] * 16384);
- TreeDist[ielem2][ielem1] = (NodeDistance[nodeindex] * 16384);
- }
- }
- MemberList[nodeindex]->AddStrings(MemberList[min1]);
- MemberList[nodeindex]->AddStrings(MemberList[min2]);
- delete MemberList[min1];
- delete MemberList[min2];
- }
- OutString = ID[nodeindex] + AnsiString("t") + ID[NodeElement[nodeindex][0]]
- + AnsiString("t") + ID[NodeElement[nodeindex][1]] + AnsiString("t")
- + AnsiString(1.0 - NodeDistance[nodeindex]);
- TreeFile->Add(OutString);
- StatusBar->SimpleText = "Generating Node " + AnsiString(nodeindex-ClusterRows+1) + " of " + AnsiString(ClusterRows-1);
- Application->ProcessMessages();
- elements[nodeindex] = elements[min1] + elements[min2];
- for (k=0;k<ClusterColumns;k++)
- {
- if ( (ClusterMask[min1][k] == true) && (ClusterMask[min2][k] == true) )
- {
- ClusterData[nodeindex][k] =
- (elements[min1] * ClusterData[min1][k] +
- elements[min2] * ClusterData[min2][k]) /
- (elements[min1] + elements[min2]);
- ClusterMask[nodeindex][k] = true;
- }
- else if (ClusterMask[min1][k] == true)
- {
- ClusterData[nodeindex][k] = ClusterData[min1][k];
- ClusterMask[nodeindex][k] = true;
- }
- else if (ClusterMask[min2][k] == true)
- {
- ClusterData[nodeindex][k] = ClusterData[min2][k];
- ClusterMask[nodeindex][k] = true;
- }
- else
- {
- ClusterMask[nodeindex][k] = false;
- }
- }
- Order[nodeindex] = (elements[min1]*Order[min1] + elements[min2]*Order[min2])/(elements[min1] + elements[min2]);
- Dist[nodeindex] = new unsigned short[2*ClusterRows-1];
- if (SaveDistances == false)
- {
- delete Dist[min1];
- delete Dist[min2];
- }
- for (l=0;l<2*ClusterRows-1;l++)
- {
- if (Active[l] == true)
- {
- Dist[nodeindex][l] =
- Distance(DistFunction,ClusterData,ClusterMask,ColumnWeight,
- nodeindex,l,ClusterColumns);
- Dist[l][nodeindex] = Dist[nodeindex][l];
- if (Dist[nodeindex][l] < minval[nodeindex])
- {
- minval[nodeindex] = Dist[nodeindex][l];
- minpair[nodeindex] = l;
- }
- //Rescan for min if minpair was one of the fused elements
- if ( (minpair[l] == min1) || (minpair[l] == min2) )
- {
- minval[l] = 32769;
- for (m=0;m<2*ClusterRows-1;m++)
- {
- if ( (Active[m] == true) && (l != m) )
- {
- if (Dist[l][m] < minval[l])
- {
- minval[l] = Dist[l][m];
- minpair[l] = m;
- }
- }
- }
- }
- //otherwise check new distance to see if it is new min
- else if (Dist[nodeindex][l] < minval[l])
- {
- minval[l] = Dist[nodeindex][l];
- minpair[l] = nodeindex;
- }
- }
- }
- Active[nodeindex] = true;
- }
- if (SaveDistances == false)
- {
- delete Dist[2*ClusterRows-2];
- }
- TList *NewList = new TList();
- TList *OutList = new TList();
- int **Elements;
- Elements = new int*[2*ClusterRows-1];
- for (i=0;i<2*ClusterRows-1;i++)
- {
- Elements[i] = new int;
- *Elements[i] = i;
- }
- OutList->Clear();
- OutList->Add((void *)Elements[2*ClusterRows-2]);
- int Position;
- bool Replaced;
- Replaced = true;
- while (Replaced == true)
- {
- Replaced = false;
- for (Position = 0; Position < OutList->Count; Position ++)
- {
- nodeindex = *(int *) OutList->Items[Position];
- if (nodeindex >= 0)
- {
- if (nodeindex < ClusterRows)
- {
- NewList->Add((void *) Elements[nodeindex]);
- }
- else
- {
- element1 = NodeElement[nodeindex][0];
- element2 = NodeElement[nodeindex][1];
- NewList->Add((void *) Elements[element1]);
- NewList->Add((void *) Elements[element2]);
- Replaced = true;
- }
- }
- }
- OutList->Clear();
- for (i=0;i<NewList->Count;i++)
- {
- OutList->Add(NewList->Items[i]);
- }
- NewList->Clear();
- }
- int index;
- ClusterOrder->Clear();
- if (SaveDistances == true)
- {
- if (DistFileDialogBox->Execute())
- {
- TFileStream *DistFile = new TFileStream(DistFileDialogBox->FileName,fmCreate);
- int ii, ij;
- for (i=0;i<OutList->Count;i++)
- {
- ii = *(int *)OutList->Items[i];
- for (j=0;j<OutList->Count;j++)
- {
- {
- ij = *(int *)OutList->Items[j];
- DistFile->Write(&Dist[ii][ij],2);
- }
- }
- }
- delete DistFile;
- for (i=0;i<2*ClusterRows-1;i++)
- {
- delete Dist[i];
- }
- }
- }
- if (CalcInferredDistances == true)
- {
- if (DistFileDialogBox->Execute())
- {
- TFileStream *DistFile = new TFileStream(DistFileDialogBox->FileName,fmCreate);
- int ii, ij;
- for (i=0;i<OutList->Count;i++)
- {
- ii = *(int *)OutList->Items[i];
- for (j=0;j<OutList->Count;j++)
- {
- {
- ij = *(int *)OutList->Items[j];
- DistFile->Write(&TreeDist[ii][ij],2);
- }
- }
- }
- delete DistFile;
- for (i=0;i<ClusterRows;i++)
- {
- delete TreeDist[i];
- }
- }
- delete TreeDist;
- }
- for (i=0;i<OutList->Count;i++)
- {
- index = *(int *)OutList->Items[i];
- ClusterOrder->Add(index);
- }
- for (i=0;i<2*ClusterRows-1;i++)
- {
- delete NodeElement[i];
- delete Elements[i];
- }
- if (CalculateWeights == true)
- {
- for (i=0;i<ClusterRows;i++)
- {
- RowWeight[i] = 1/CalcWeight[i];
- }
- }
- delete Dist;
- delete NodeElement;
- delete Elements;
- delete NodeDistance;
- delete elements;
- delete minval;
- delete minpair;
- delete NewList;
- delete OutList;
- delete Active;
- delete CalcWeight;
- }
- //---------------------------------------------------------------------------
- void __fastcall TMainForm::CompleteLinkageClusterButtonClick(TObject *Sender)
- {
- int i,j;
- StatusBar1->SimpleText = "Initializing";
- // Get ClusterID for file names from user input
- JobName = AnsiString(ClusterName);
- if (JobNameEdit->Text.Length() > 0)
- {
- JobName = JobNameEdit->Text;
- }
- bool ClusterGenes = ClusterGenesCheckBox->Checked;
- bool ClusterArrays = ClusterArraysCheckBox->Checked;
- bool CalculateGeneWeights = CalculateGeneWeightsCheckBox->Checked;
- bool CalculateArrayWeights = CalculateArrayWeightsCheckBox->Checked;
- int *ColumnOrder = new int[Columns];
- AnsiString *ArrayID = new AnsiString[2*Columns-1];
- for (i=0;i<Columns;i++)
- {
- ColumnOrder[i] = i;
- ArrayID[i] = "ARRY" + AnsiString(i) + "X";
- }
- if (CalculateGeneWeights == true)
- {
- // Generate array data structures
- // This is a wasteful way of doing it, but makes
- // the coding easier for me
- double **ArrayData = new double*[2*Columns-1];
- bool **ArrayMask = new bool*[2*Columns-1];
- for (i=0;i<2*Columns-1;i++)
- {
- ArrayData[i] = new double[Rows];
- ArrayMask[i] = new bool[Rows];
- if (i<Columns)
- {
- for (j=0;j<Rows;j++)
- {
- ArrayData[i][j] = GeneData[j][i];
- ArrayMask[i][j] = GeneMask[j][i];
- }
- }
- }
- TStringList *ArrayTreeFile = new TStringList();
- TStringList *ArrayClusterOrder = new TStringList();
- CompactCluster(ArrayData,ArrayMask,Columns,Rows,
- ArrayTreeFile,ArrayOrder,ArrayWeight,GeneWeight,ArrayID,
- ArrayMetricComboBox->ItemIndex,
- true,ArrayWeightCutoff,ArrayWeightExp,
- ArrayClusterOrder,StatusBar1,false,NULL);
- delete ArrayClusterOrder;
- for (i=0;i<2*Columns-1;i++)
- {
- delete ArrayData[i];
- delete ArrayMask[i];
- }
- delete ArrayData;
- delete ArrayMask;
- }
- int *RowOrder = new int[Rows];
- AnsiString *GeneID = new AnsiString[2*Rows-1];
- for (i=0;i<Rows;i++)
- {
- RowOrder[i] = i;
- GeneID[i] = AnsiString("GENE") + AnsiString(i) + AnsiString("X");
- }
- if (ClusterGenes == true)
- {
- TStringList *GeneTreeFile = new TStringList();
- TStringList *GeneClusterOrder = new TStringList();
- if (CalculateArrayWeights == true)
- {
- CompactCluster(GeneData,GeneMask,Rows,Columns,
- GeneTreeFile,GeneOrder,GeneWeight, ArrayWeight,GeneID,
- GeneMetricComboBox->ItemIndex,
- true,GeneWeightCutoff,GeneWeightExp,
- GeneClusterOrder,StatusBar1,false,NULL);
- }
- else
- {
- double *NullGeneWeight = new double[Rows];
- CompactCluster(GeneData,GeneMask,Rows,Columns,
- GeneTreeFile,GeneOrder,NullGeneWeight,ArrayWeight,GeneID,
- GeneMetricComboBox->ItemIndex,
- false,1.0,1.0,
- GeneClusterOrder,StatusBar1,false,NULL);
- delete NullGeneWeight;
- }
- for (i=0;i<GeneClusterOrder->Count;i++)
- {
- RowOrder[i] = GeneClusterOrder->Strings[i].ToInt();
- }
- AnsiString TreeFileName = JobName + ".gtr";
- GeneTreeFile->SaveToFile(TreeFileName);
- }
- if (ClusterArrays == true)
- {
- // Generate array data structures
- // This is a wasteful way of doing it, but makes
- // the coding easier for me
- double **ArrayData = new double*[2*Columns-1];
- bool **ArrayMask = new bool*[2*Columns-1];
- for (i=0;i<2*Columns-1;i++)
- {
- ArrayData[i] = new double[Rows];
- ArrayMask[i] = new bool[Rows];
- if (i<Columns)
- {
- for (j=0;j<Rows;j++)
- {
- ArrayData[i][j] = GeneData[j][i];
- ArrayMask[i][j] = GeneMask[j][i];
- }
- }
- }
- TStringList *ArrayTreeFile = new TStringList();
- TStringList *ArrayClusterOrder = new TStringList();
- CompactCluster(ArrayData,ArrayMask,Columns,Rows,
- ArrayTreeFile,ArrayOrder,ArrayWeight,GeneWeight,ArrayID,
- ArrayMetricComboBox->ItemIndex,
- false,1.0,1.0,ArrayClusterOrder,StatusBar1,false,NULL);
- for (i=0;i<ArrayClusterOrder->Count;i++)
- {
- ColumnOrder[i] = ArrayClusterOrder->Strings[i].ToInt();
- }
- delete ArrayClusterOrder;
- for (i=0;i<2*Columns-1;i++)
- {
- delete ArrayData[i];
- delete ArrayMask[i];
- }
- delete ArrayData;
- delete ArrayMask;
- AnsiString ArrayTreeFileName = JobName + ".atr";
- ArrayTreeFile->SaveToFile(ArrayTreeFileName);
- delete ArrayTreeFile;
- }
- AnsiString OutString;
- TStringList *DataFile = new TStringList();
- // Now make output .cdt file
- OutString = "";
- if (ClusterGenes == true)
- {
- OutString += AnsiString("GID") + AnsiString("t");
- }
- OutString += Headers->Strings[0] + AnsiString("t");
- OutString += AnsiString("NAME") + AnsiString("t");
- OutString += "GWEIGHT";
- // Now add headers for data columns
- for (i=0;i<Columns;i++)
- {
- OutString += "t";
- OutString += AnsiString(Headers->Strings[InColumn[ColumnOrder[i]]]);
- }
- DataFile->Add(OutString);
- if (ClusterArrays == true)
- {
- OutString = AnsiString("AID");
- if (ClusterGenes == true)
- {
- OutString += AnsiString("t");
- }
- OutString += AnsiString("t");
- OutString += AnsiString("t");
- for (i=0;i<Columns;i++)
- {
- OutString += "t";
- OutString += ArrayID[ColumnOrder[i]];
- }
- DataFile->Add(OutString);
- }
- {
- OutString = AnsiString("EWEIGHT");
- if (ClusterGenes == true)
- {
- OutString += AnsiString("t");
- }
- OutString += AnsiString("t");
- OutString += AnsiString("t");
- for (i=0;i<Columns;i++)
- {
- OutString += "t";
- OutString += ArrayWeight[ColumnOrder[i]];
- }
- }
- DataFile->Add(OutString);
- int index;
- TFloatFormat Format = ffGeneral;
- for (i=0;i<Rows;i++)
- {
- index = RowOrder[i];
- OutString = "";
- if (ClusterGenes == true)
- {
- OutString += GeneID[index] + "t";
- }
- OutString += AnsiString(UniqID[index]) + "t" + AnsiString(GeneName[index]);
- OutString += "t" + AnsiString(GeneWeight[index]);
- for (j=0;j<Columns;j++)
- {
- if (GeneMask[index][ColumnOrder[j]] == true)
- {
- OutString += "t" + AnsiString(FloatToStrF(GeneData[index][ColumnOrder[j]],Format,4,2));
- }
- else
- {
- OutString += "t";
- }
- }
- DataFile->Add(OutString);
- }
- AnsiString DataFileName = JobName + ".cdt";
- DataFile->SaveToFile(DataFileName);
- for (i=0;i<2*Columns-1;i++)
- {
- ArrayID[i] = "";
- }
- for (i=0;i<2*Rows-1;i++)
- {
- GeneID[i] = "";
- }
- delete ArrayID;
- delete GeneID;
- delete ColumnOrder;
- delete RowOrder;
- StatusBar1->SimpleText = "Done Clustering";
- }
- //---------------------------------------------------------------------------
- /* Compact/Complete Linkage Hierarchical Clustering Code */
- void TMainForm::CompactCluster(double **ClusterData, bool **ClusterMask,
- int ClusterRows, int ClusterColumns,
- TStringList *TreeFile, double *Order, double *RowWeight, double *ColumnWeight,
- AnsiString *ID, int DistFunction,
- bool CalculateWeights, double WeightCutoff, double WeightPower,
- TStringList *ClusterOrder, TStatusBar *StatusBar,
- bool ReturnTNode, TNode *TopNode)
- {
- int i,j,k,l,m;
- ReturnTNode = true;
- unsigned short *minval;
- int *minpair;
- int *elements;
- AnsiString OutString;
- unsigned short **Dist;
- int **NodeElement;
- double *NodeDistance;
- double *CalcWeight;
- int element1, element2;
-
- TStringList *List1 = new TStringList();
- TStringList *List2 = new TStringList();
- Dist = new unsigned short*[2*ClusterRows-1];
- minval = new unsigned short[2*ClusterRows-1];
- minpair = new int[2*ClusterRows-1];
- elements = new int[2*ClusterRows-1];
- NodeElement = new int*[2*ClusterRows-1];
- NodeDistance = new double[2*ClusterRows-1];
- CalcWeight = new double[2*ClusterRows-1];
- int *iList1 = new int[ClusterRows];
- int *iList2 = new int[ClusterRows];
- bool SaveDistances = true;
- double WeightDist;
- for (i=0;i<ClusterRows;i++)
- {
- Dist[i] = new unsigned short[2*ClusterRows-1];
- }
- for (i=0;i<2*ClusterRows-1;i++)
- {
- minval[i] = 32769;
- elements[i] = 1;
- NodeElement[i] = new int[2];
- CalcWeight[i] = 1;
- NodeDistance[i] = 0.0;
- }
- bool Centered = false;
- if ((DistFunction == 1) || (DistFunction == 3) )
- {
- Centered = true;
- }
- bool Absolute = false;
- if ((DistFunction == 3) || (DistFunction == 4) )
- {
- Absolute = true;
- }
- bool CalcInferredDistances = false;
- TStringList **MemberList;
- unsigned short **TreeDist;
- if (CalcInferredDistances == true)
- {
- MemberList = new TStringList*[2*ClusterRows-1];
- TreeDist = new unsigned short*[ClusterRows];
- for (i=0;i<2*ClusterRows-1;i++)
- {
- MemberList[i] = new TStringList();
- if (i<ClusterRows)
- {
- MemberList[i]->Add(AnsiString(i));
- TreeDist[i] = new unsigned short[ClusterRows];
- }
- }
- }
- int TotalDistance = ClusterRows * (ClusterRows - 1) / 2;
- /* First step is to compute the distance matrix
- This is the slowest step */
- for (i=0;i<ClusterRows;i++)
- {
- Dist[i][i] = 0.0;
- for (j=0;j<i;j++)
- {
- Dist[i][j] = Distance(DistFunction,ClusterData,ClusterMask,ColumnWeight,
- i,j,ClusterColumns);
- Dist[j][i] = Dist[i][j];
- if (CalculateWeights == true)
- {
- WeightDist = ( (double) (32768 - Dist[i][j])) /32768.0;
- WeightDist = max(0.0, (WeightDist - WeightCutoff)/(1.0 - WeightCutoff));
- WeightDist = pow(WeightDist,WeightPower);
- CalcWeight[i] += WeightDist;
- CalcWeight[j] += WeightDist;
- }