% Journal article record; citation key IOPORT.06064057 (database identifier 06064057).
% Author names are all in "Family, Given" form so BibTeX parses them unambiguously.
@article{IOPORT.06064057,
  author     = {Zhang, Yongqing and Zhang, Danling and Mi, Gang and Ma, Daichuan and Li, Gongbing and Guo, Yanzhi and Li, Menglong and Zhu, Min},
  title      = {Using ensemble methods to deal with imbalanced data in predicting protein-protein interactions},
  journal    = {Computational Biology and Chemistry},
  volume     = {36},
  pages      = {36--41},
  year       = {2012},
  publisher  = {Elsevier Science Ireland (Pergamon)},
  address    = {Shannon},
  issn       = {1476-9271},
  doi        = {10.1016/j.compbiolchem.2011.12.003},
  abstract   = {Summary: In proteins, the number of interacting pairs is usually much smaller than the number of non-interacting ones. So the imbalanced data problem will arise in the field of protein-protein interactions (PPIs) prediction. We introduce two ensemble methods to solve the imbalanced data problem. These ensemble methods combine the based-cluster under-sampling technique and the fusion classifiers. Then we evaluate the ensemble methods using a data set from the Data Base of Interacting Proteins (DIP) with 10-fold cross validation. All the prediction models achieve the area under the receiver operating characteristic curve (AUC) value about 95\%. Our results show that the ensemble classifiers are quite effective in predicting PPIs; we also gain some valuable conclusions on the performance of ensemble methods for PPIs in imbalanced data. The prediction software and all data sets employed in the work can be obtained for free at \url{http://cic.scu.edu.cn/bioinformatics/Ensemble_PPIs/index.html}.},
  identifier = {06064057},
}