BALL  1.4.1
QSARData.h
Go to the documentation of this file.
00001 /* QSARData.h
00002  * 
00003  * Copyright (C) 2009 Marcel Schumann
00004  * 
00005  * This file is part of QuEasy -- A Toolbox for Automated QSAR Model
00006  * Construction and Validation.
00007  * QuEasy is free software; you can redistribute it and/or modify
00008  * it under the terms of the GNU General Public License as published by
00009  * the Free Software Foundation; either version 3 of the License, or (at
00010  * your option) any later version.
00011  * 
00012  * QuEasy is distributed in the hope that it will be useful, but
00013  * WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015  * General Public License for more details.
00016  * 
00017  * You should have received a copy of the GNU General Public License
00018  * along with this program; if not, see <http://www.gnu.org/licenses/>.
00019  */
00020 
00021 // -*- Mode: C++; tab-width: 2; -*-
00022 // vi: set ts=2:
00023 //
00024 // 
00025 
00026 #ifndef QSARH
00027 #define QSARH
00028 
00029 #include <iostream>
00030 #include <BALL/KERNEL/system.h>
00031 #include <BALL/FORMAT/SDFile.h>
00032 #include <BALL/FORMAT/PDBFile.h>
00033 #include <BALL/FORMAT/HINFile.h>
00034 #include <BALL/FORMAT/MOLFile.h>
00035 #include <vector>
00036 #include <list>
00037 #include <set>
00038 #include <map>
00039 #include <math.h>
00040 #include <sstream>
00041 #include <fstream>
00042 #include <limits>
00043 #include <fstream>
00044 #include <BALL/QSAR/simpleDescriptors.h>
00045 #include <BALL/QSAR/connectivityDescriptors.h>
00046 #include <BALL/QSAR/partialChargeDescriptors.h>
00047 #include <BALL/QSAR/surfaceDescriptors.h>
00048 #include <BALL/COMMON/exception.h>
00049 #include <string.h>
00050 
00051 #ifndef STATISTICS
00052 #include <BALL/QSAR/statistics.h>
00053 #endif
00054 
00055 #ifndef QSAR_EXCEPTION
00056 #include <BALL/QSAR/exception.h>
00057 #endif
00058 
00059 #include <gsl/gsl_randist.h>
00060 #include <gsl/gsl_cdf.h>
00061 
00062 #include <BALL/CONCEPT/timeStamp.h>
00063 
00064 // #ifndef MODEL
00065 // #include "Model.h"
00066 // #endif
00067 
00068 namespace BALL
00069 {
00070   namespace QSAR
00071   {
00072     typedef vector<double> Column;   // a sequence of doubles; NOTE(review): unqualified 'vector' relies on std::vector being made visible by an included header -- confirm
00073     typedef vector<Column> VMatrix;  // a matrix represented as a vector of Column objects
00074     
00076     class BALL_EXPORT QSARData 
00077     {
          // Holds the input data of a QSAR task: a descriptor matrix, a response
          // matrix (Y_), the transformations recorded for both, and the names of
          // columns and substances (see the protected data members of this class).
          // The validation, model and feature-selection classes listed as friends
          // at the end of the class access the protected members directly.
00078       public:
00079         
              /// Default constructor.
00080         QSARData();
00081         
              /// Destructor. NOTE(review): declared non-virtual.
00082         ~QSARData();
00083         
              /// Reports whether the descriptor data is centered.
              /// NOTE(review): how this is determined is not visible in this header -- confirm in the implementation.
00087         bool isDataCentered() const;
00088         
              /// Reports whether the response data is centered.
              /// NOTE(review): how this is determined is not visible in this header -- confirm in the implementation.
00090         bool isResponseCentered() const;
00091           
              /// Reads property names from the SD file 'sd_file' (judging by the name).
              /// Returns a pointer to a vector of names.
              /// NOTE(review): ownership of the returned vector is not stated here; presumably the caller must delete it -- confirm.
00096         vector<String>* readPropertyNames(String sd_file);
00097 
              /// Reads input data from the SD file 'file'.
              /// NOTE(review): which properties are treated as responses by this overload is not visible here -- confirm.
00101         void readSDFile(const char* file);
00102         
              /// Reads input data from the SD file 'file'.
              /// \param act integer IDs; presumably the IDs of the properties to be used as response variables -- confirm
              /// \param useExDesc defaults to 1 (true); presumably "use external descriptors", i.e. take descriptors from the file's properties -- confirm
              /// \param append defaults to 0 (false); presumably appends to already loaded data instead of replacing it -- confirm
              /// \param translate_class_labels defaults to 0 (false); presumably maps class-label names to integer IDs (cf. class_names_) -- confirm
00108         void readSDFile(const char* file, std::multiset<int>& act, bool useExDesc=1, bool append=0, bool translate_class_labels=0);
00109   
              /// Calculates descriptors for molecule 'm'.
              /// The molecule is taken by non-const reference, so it may be modified.
              /// NOTE(review): presumably uses the descriptor classes from the BALL/QSAR descriptor headers included by this file -- confirm.
00113         void calculateBALLDescriptors(Molecule& m);
00114           
              /// Displays the matrix. NOTE(review): presumably prints the descriptor matrix to standard output -- confirm.
00116         void displayMatrix();
00117         
              /// Centers the data.
              /// \param center_Y defaults to 0 (false); presumably requests that the response variables are centered as well -- confirm
00120         void centerData(bool center_Y=0);
00121   
              /// Scales all descriptors. NOTE(review): the scaling scheme is not visible in this header -- confirm.
00123         void scaleAllDescriptors();
00124         
              /// Returns the number of substances.
00126         unsigned int getNoSubstances() const;
00127         
              /// Returns the number of descriptors.
00129         unsigned int getNoDescriptors() const;
00130         
              /// Reads input data from the CSV file 'file'.
              /// \param no_y presumably the number of response-variable columns in the file -- confirm
              /// \param xlabels presumably whether the file contains descriptor (column) labels -- confirm
              /// \param ylabels presumably whether the file contains substance (row) labels -- confirm
              /// \param sep separator string, defaults to ","
              /// \param appendDescriptors defaults to 0 (false); presumably adds the file's columns to already loaded data -- confirm
              /// \param translate_class_labels defaults to 0 (false); presumably maps class-label names to integer IDs (cf. class_names_) -- confirm
00138         void readCSVFile(const char* file, int no_y, bool xlabels, bool ylabels, const char* sep=",", bool appendDescriptors=0, bool translate_class_labels=0);
00139   
              /// Manipulates the response variables according to the strings in 'v'.
              /// NOTE(review): presumably one expression per response variable -- confirm the expected string format.
00141         void manipulateY(vector<String> v);
00142   
              /// Manipulates the response variables according to the single string 'v'.
              /// NOTE(review): presumably the same expression is applied to every response variable -- confirm.
00145         void manipulateY(String v);
00146         
              /// Discretizes the response variables using the given thresholds.
              /// NOTE(review): how thresholds map to class labels is not visible here -- confirm.
00149         void discretizeY(vector<double> thresholds);
00150         
              /// Transforms the descriptors according to the strings in 'v'.
              /// NOTE(review): presumably the descriptor counterpart of manipulateY() -- confirm.
00151         void transformX(vector<String> v);
00152         
              /// Partitions the input data; 'p' is presumably the number of partitions -- confirm.
              /// Returns raw pointers to QSARData objects.
              /// NOTE(review): ownership of the returned objects is not stated; presumably the caller must delete them -- confirm.
00154         vector<QSARData*> partitionInputData(int p);
00155         
              /// Saves this object to the file 'filename'. Does not modify the object (const).
00157         void saveToFile(string filename) const;
00158         
              /// Reads the content of this object from the file 'filename'.
              /// NOTE(review): presumably expects the format written by saveToFile() -- confirm.
00160         void readFromFile(string filename);
00161         
              /// Generates an external set from this data without modifying it (const).
              /// \param fraction presumably the fraction of substances to put into the external set -- confirm
              /// Returns raw pointers to QSARData objects.
              /// NOTE(review): number/order of returned objects and their ownership are not visible here -- confirm.
00164         vector<QSARData*> generateExternalSet(double fraction) const;
00165 
              /// Splits this data evenly without modifying it (const).
              /// \param no_test_splits presumably the total number of test splits -- confirm
              /// \param current_test_split_id presumably selects which of the splits is produced by this call -- confirm
              /// \param response_id defaults to 0; presumably the response variable the split is based on -- confirm
              /// Returns raw pointers to QSARData objects.
              /// NOTE(review): number/order of returned objects and their ownership are not visible here -- confirm.
00170         vector<QSARData*> evenSplit(int no_test_splits, int current_test_split_id, int response_id=0) const;
00171         
              /// Returns a pointer to a vector of doubles for substance 's'; presumably its descriptor values -- confirm.
              /// NOTE(review): ownership and whether values are back-transformed are not visible here -- confirm.
00173         vector<double>* getSubstance(int s) const;
00174         
              /// Returns a pointer to a vector of doubles for substance 's'; presumably its response values -- confirm.
              /// NOTE(review): ownership and whether values are back-transformed are not visible here -- confirm.
00176         vector<double>* getActivity(int s) const;
00177         
              /// Returns the number of response variables.
00179         unsigned int getNoResponseVariables() const;
00180         
              /// Returns a pointer to a read-only vector of substance names.
              /// NOTE(review): presumably points to substance_names_ and is therefore owned by this object -- confirm.
00181         const vector<string>* getSubstanceNames() const;
00182         
              /// Checks for discrete response values in the data held by this object.
              /// NOTE(review): the exact criterion for "discrete" is not visible here -- confirm.
00184         bool checkforDiscreteY() const;
00185         
00186         
              /// Checks for discrete response values in the file 'file', using the given activity IDs.
              /// NOTE(review): the supported file format(s) are not visible here -- confirm.
00188         bool checkforDiscreteY(const char* file, std::multiset<int>& activity_IDs) const;
00189         
              /// Sets the data folder. NOTE(review): presumably stored in data_folder_ -- confirm.
00191         void setDataFolder(const char* folder);
00192         
              /// Removes highly correlated compounds.
              /// Both thresholds are taken by non-const reference, so they may be modified by the call.
              /// NOTE(review): the exact role of each threshold is not visible here -- confirm.
00195         void removeHighlyCorrelatedCompounds(double& compound_cor_threshold, double& feature_cor_threshold);
00196         
              /// Collects descriptors similar to the descriptor 'descriptor_ID' into 'similar_descriptor_IDs'.
              /// \param correlation presumably the minimum correlation required to count as similar -- confirm
              /// \param similar_descriptor_IDs output list of (uint, String) pairs; presumably descriptor ID and name -- confirm
00202         void getSimilarDescriptors(int descriptor_ID, double correlation, std::list<std::pair<uint,String> >& similar_descriptor_IDs) const;
00204         
00205         
00206       protected:
00207         
              /// Sets the descriptor names using molecule 'm' and the given activity IDs.
              /// \param useExDesc defaults to 1 (true); presumably the same meaning as in readSDFile() -- confirm
00212         void setDescriptorNames(const Molecule& m, std::multiset<int>& activity_IDs, bool useExDesc=1);
00213         
              /// Removes the descriptors whose IDs are given in 'invalidDescriptors'.
00216         void removeInvalidDescriptors(std::multiset<int>& invalidDescriptors);
00217         
              /// Removes the substances whose IDs are given in 'inv'.
00218         void removeInvalidSubstances(std::multiset<int>& inv);
00219         
              /// Reads a matrix from the stream 'in' into 'mat'.
              /// \param seperator the separator character used in the input
              /// \param lines presumably the number of lines to read -- confirm
              /// \param col presumably the number of columns per line -- confirm
00221         void readMatrix(VMatrix& mat, std::ifstream& in, char seperator, unsigned int lines, unsigned int col);
00222         
              /// Checks the activity IDs in 'act' against the number of properties 'no_properties'.
              /// NOTE(review): the reaction to an invalid ID (e.g. an exception) is not visible here -- confirm.
00225         void checkActivityIDs(std::multiset<int>& act, int no_properties);
00226         
              /// Inserts substance 's' of 'source' into this object.
              /// \param backtransformation defaults to 0 (false); presumably undoes the transformations recorded in 'source' before inserting -- confirm
00229         void insertSubstance(const QSARData* source, int s, bool backtransformation=0);
00230         
              /// Prints the matrix 'mat' to the stream 'out'. Does not modify this object (const).
00232         void printMatrix(const VMatrix& mat, std::ostream& out) const;
00234         
              /// The descriptor matrix. NOTE(review): presumably one Column per descriptor -- confirm orientation.
00239         VMatrix descriptor_matrix_;
00240         
              /// The response matrix. NOTE(review): presumably one Column per response variable -- confirm orientation.
00242         VMatrix Y_;
00243         
              /// Transformations recorded for the descriptors.
              /// NOTE(review): presumably the centering/scaling parameters per descriptor -- confirm layout.
00245         VMatrix descriptor_transformations_;
00246         
              /// Transformations recorded for the response variables.
              /// NOTE(review): presumably the centering/scaling parameters per response variable -- confirm layout.
00248         VMatrix y_transformations_;
00249       
              /// Names of the columns. NOTE(review): presumably the descriptor names -- confirm.
00251         vector<string> column_names_;
00252   
              /// Names of the substances.
00254         vector<string> substance_names_;
00255   
              /// IDs of descriptors considered invalid.
00257         std::multiset<int> invalidDescriptors_;
00258         
              /// IDs of substances considered invalid.
00259         std::multiset<int> invalidSubstances_;
00260         
              /// The data folder (cf. setDataFolder()).
00261         String data_folder_;
00262         
              /// Maps a class name to an integer.
              /// NOTE(review): presumably filled when translate_class_labels is requested while reading -- confirm.
00264         std::map<String,int> class_names_;
00266 
00267         
00268         
              // These classes are granted direct access to the protected members above.
00269         friend class ClassificationValidation;
00270         friend class RegressionValidation;
00271         friend class Validation;
00272         friend class Model;
00273         friend class FitModel;
00274         friend class FeatureSelection;
00275   
00276     };
00277 
00278   }
00279 }
00280 
00281 #endif // QSARH
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Defines