//*********************************************************************** // k_means.h // // this is an implementationof the K-meanss classification algorithm. // it accepts one attribute and measures the // // INPUTS: (from disk file) // control.dat - control file - space delimited // k value & datafile name // // datafile.in - classification set // attribute count - don't include the classification in // the count // data - space delimited // // OUTPUTS: (to disk file) // datafile.out - space delimited // test instance // k-nearest classification neighbor instances // //*********************************************************************** // WARNING: none // //*********************************************************************** // IMPLEMENTATION NOTE: all files are in the executable working directory // //*********************************************************************** // created by: j. aleshunas // created on: 9 nov 04 // modified on: 12 nov 04 // // © 2004 John Aleshunas // //*********************************************************************** #include #include #include #include #include using namespace std; //*********************************************************************** // class Cluster_instance declaration //*********************************************************************** class Cluster_instance { // private class variables // private methods public: // public class variables float fAttribute; string sClassification; // public methods Cluster_instance(void); // constructor }; // class Cluster_instance //*********************************************************************** // class Cluster_instance method declarations //*********************************************************************** // class Classification_instance constructor Cluster_instance::Cluster_instance(void){ // make room for the attributes //vfAttributes.resize(iAttribute_ct); return; } //Cluster_instance::Cluster_instance //*********************************************************************** // class Cluster declaration //*********************************************************************** class Cluster { // private class variables // private methods public: // public class variables vector vclThe_cluster; // public methods Cluster(void); // constructor }; // class Cluster //*********************************************************************** // class Cluster method declarations //*********************************************************************** // class Classification_instance constructor Cluster::Cluster(void){ // no code yet return; } //Cluster::Cluster //*********************************************************************** // class Cluster_set declaration //*********************************************************************** class Cluster_set { // private class variables vector vclThe_cluster_set; vector vfMeans; vector vfOld_means; //float fDiff_old; //float fDiff_new; /// control data variables int iK_count; string sIn_file; string sOut_file; float fTolerance; Cluster vclInput_data; int iIteration; // private methods void Read_control_data(void); void Read_input_data(void); void Write_output_data(void); void Setup_cluster_set(void); void Identify_mean_values(void); void Cluster_data(void); void Calculate_cluster_means(void); bool Compare_mean_values(void); public: // public class variables // public methods Cluster_set(void); // constructor void Execute_clustering(void); }; // class Cluster_set //*********************************************************************** // class Cluster_set public method declarations //*********************************************************************** // class Cluster_set constructor Cluster_set::Cluster_set(void){ // local variables // initialize the count of iterations iIteration = 0; return; } //Cluster_set::Cluster_set //*********************************************************************** void Cluster_set::Execute_clustering(void){ // local variables bool bNot_done = true; // read the control file Read_control_data( ); // read the input data Read_input_data( ); // loop until we are done clustering - the mean values don't change while(bNot_done){ // set up the cluster set Setup_cluster_set( ); // // identify the k means values Identify_mean_values( ); // // cluster the input data using the k means values Cluster_data( ); // // calculate the means of the clusters Calculate_cluster_means( ); // // compare the old mean values to the new mean values // if the difference is less than the tolerance value then stop clustering bNot_done = Compare_mean_values( ); // // increment the iteration iIteration++; // end the main loop } // while // write the output data Write_output_data( ); // <--- THIS IS THE NEXT LOCATION TO WORK ON return; } // Cluster_set::Execute_clustering //*********************************************************************** // class Cluster_set private method declarations //*********************************************************************** void Cluster_set::Read_control_data(void) { // declare an input stream to read the data ifstream strInput_stream; // modify the filename string sFilename = "control.dat"; // open the input stream to read the key strInput_stream.open(sFilename.c_str()); // check if the file was OK if (strInput_stream.is_open()){ // read the count of neighbors strInput_stream >> iK_count; // read the filenames strInput_stream >> sIn_file; strInput_stream >> sOut_file; // read the stopping criteria strInput_stream >> fTolerance; } //if else cout << "Error reading the key file!" << endl << endl; // print error message strInput_stream.close(); // close filestream return; } // Cluster_set::Read_control_file //*********************************************************************** void Cluster_set::Read_input_data(void){ // local variables // declare an input stream to read the key ifstream strInput_stream; // modify the filename sIn_file = sIn_file + ".dat"; // declare data storage for the input data Cluster_instance clInput_instance; // open the input stream to read the key strInput_stream.open(sIn_file.c_str()); // check if the file was OK if (strInput_stream.is_open()){ while(!strInput_stream.eof()){ // get the attribute strInput_stream >> clInput_instance.fAttribute; // read the classification name strInput_stream >> clInput_instance.sClassification; // save the data in the vector set //vclInput_data.push_back(clInput_instance); vclInput_data.vclThe_cluster.push_back(clInput_instance); }// while } //if else cout << "Error reading the key file!" << endl << endl; // print error message strInput_stream.close(); // close filestream return; } //Cluster_set::Read_input_data //*********************************************************************** void Cluster_set::Write_output_data(void){ // local variables unsigned uIndex1; int iIndex; // modify the filename sOut_file = sOut_file + "_cl.out"; // declare an output stream ofstream strResults_out_stream; // open the stream to write the output plaintext strResults_out_stream.open(sOut_file.c_str()); // loop thru each cluster for(iIndex = 0; iIndex < iK_count; iIndex++){ // output a cluster header - cluster #, cluster mean strResults_out_stream << "Cluster #" << iIndex + 1 << " with mean " << vfMeans[iIndex] << " and member count " << vclThe_cluster_set[iIndex].vclThe_cluster.size( ) << endl; // loop thru the cluster members for(uIndex1 = 0; uIndex1 < vclThe_cluster_set[iIndex].vclThe_cluster.size( ); uIndex1++){ // output the cluster member data strResults_out_stream << vclThe_cluster_set[iIndex].vclThe_cluster[uIndex1].fAttribute; strResults_out_stream << " "; strResults_out_stream << vclThe_cluster_set[iIndex].vclThe_cluster[uIndex1].sClassification; // output a CR/LF strResults_out_stream << endl; } // for // output CR/LF strResults_out_stream << endl; } // for strResults_out_stream.close(); //for(uIndex2 = 0; uIndex2 < vclInstances[uIndex1].vfAttributes.size(); uIndex2++){ // strResults_out_stream << vclInstances[uIndex1].vfAttributes[uIndex2]; //strResults_out_stream << " "; //} // for // write the plaintext to the output file stream //strResults_out_stream << vclInstances[uIndex1].sClassification; //strResults_out_stream << endl << endl; return; } // Cluster_set::Write_output_data //*********************************************************************** void Cluster_set::Setup_cluster_set(void) { // local variables unsigned uCheck_data = 99; // clear the clusterset memory if(iIteration > 0) vclThe_cluster_set.clear( ); // check the data uCheck_data = vclThe_cluster_set.size( ); // allocate the clusterset memory vclThe_cluster_set.resize(iK_count); // check the data uCheck_data = vclThe_cluster_set.size( ); return; } // Cluster_set::Setup_cluster_set //*********************************************************************** void Cluster_set::Identify_mean_values(void){ // local variables int iIndex; float fCheck_data; if(iIteration < 1) { // if this is the first iteration for(iIndex = 0; iIndex < iK_count; iIndex++){ // designate the first k input data values as the initial means vfMeans.push_back(vclInput_data.vclThe_cluster[iIndex].fAttribute); // initialize the old_means values storage vfOld_means.push_back(vclInput_data.vclThe_cluster[iIndex].fAttribute); } //for } // if // check the data for(iIndex = 0; iIndex < iK_count; iIndex++){ // copy the new_means values into the old_means values storage fCheck_data = vfMeans[iIndex]; } //for return; } //Cluster_set::Identify_mean_values //*********************************************************************** void Cluster_set::Cluster_data(void){ // local variables float fDifference; float fSquared_difference; float fBest_squared_difference; int iBest_index; unsigned uIndex; int iIndex; Cluster_instance clData_instance; // loop for all the input data values for(uIndex = 0; uIndex < vclInput_data.vclThe_cluster.size( ); uIndex++) { // initialize the tracking variables fBest_squared_difference = 999999; // make this erroneously big iBest_index = 999999; // make this erroneously big // get the next data value clData_instance = vclInput_data.vclThe_cluster[uIndex]; // loop thru all of the mean values for(iIndex = 0; iIndex < iK_count; iIndex++) { // calculate the squared difference between the data value and the mean value fDifference = clData_instance.fAttribute - vfMeans[iIndex]; fSquared_difference = fDifference * fDifference; // if this value is less than the best squared difference (BSD) if(fSquared_difference < fBest_squared_difference){ fBest_squared_difference = fSquared_difference; // save it as BSD // and save the this index as best index]] iBest_index = iIndex; } // if } // for // insert the data_instance into the iIndex cluster of the cluster_set vclThe_cluster_set[iBest_index].vclThe_cluster.push_back(clData_instance); } // for // check the data for(iIndex = 0; iIndex < iK_count; iIndex++) { uIndex = vclThe_cluster_set[iIndex].vclThe_cluster.size( ); } // for return; } // Cluster_set::Cluster_data //*********************************************************************** void Cluster_set::Calculate_cluster_means(void){ // local variables float fSum_of_values; int iIndex; unsigned uIndex; float fCluster_means; // save the old mean values for(iIndex = 0; iIndex < iK_count; iIndex++){ // copy the new_means values into the old_means values storage vfOld_means[iIndex] = vfMeans[iIndex]; } //for // loop thru each cluster for(iIndex = 0; iIndex < iK_count; iIndex++){ // initialize the sum_of_values fSum_of_values = 0; // loop thru each value in the cluster for(uIndex = 0; uIndex < vclThe_cluster_set[iIndex].vclThe_cluster.size( ); uIndex++){ // add to the sum fSum_of_values = fSum_of_values + vclThe_cluster_set[iIndex].vclThe_cluster[uIndex].fAttribute; } // for // calulate the mean of the sum fCluster_means = fSum_of_values / (float)uIndex; // save the mean value vfMeans[iIndex] = fCluster_means; } // for return; } // Cluster_set::Calculate_cluster_means //*********************************************************************** bool Cluster_set::Compare_mean_values(void){ // local variables bool bNot_done; float sStep_difference; float fResult_difference = 0; int iIndex; // calculate the difference between the old means and the new means for(iIndex = 0; iIndex < iK_count; iIndex++){ sStep_difference = vfMeans[iIndex] - vfOld_means[iIndex]; // guarantee a positive value without using squares if(sStep_difference < 0) sStep_difference = sStep_difference * (-1); // add the step difference to the total difference of means fResult_difference = fResult_difference + sStep_difference; } // for // stop clustering if there is little change in the mean values if(fResult_difference < fTolerance) bNot_done = false; else bNot_done = true; return bNot_done; } // Cluster_set::Compare_mean_values //*********************************************************************** //*********************************************************************** //*********************************************************************** //*********************************************************************** //*********************************************************************** //*********************************************************************** //***********************************************************************