001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.math.stat.correlation;
018    
019    import org.apache.commons.math.MathException;
020    import org.apache.commons.math.MathRuntimeException;
021    import org.apache.commons.math.distribution.TDistribution;
022    import org.apache.commons.math.distribution.TDistributionImpl;
023    import org.apache.commons.math.exception.util.LocalizedFormats;
024    import org.apache.commons.math.exception.NullArgumentException;
025    import org.apache.commons.math.exception.DimensionMismatchException;
026    import org.apache.commons.math.linear.RealMatrix;
027    import org.apache.commons.math.linear.BlockRealMatrix;
028    import org.apache.commons.math.stat.regression.SimpleRegression;
029    import org.apache.commons.math.util.FastMath;
030    
031    /**
032     * Computes Pearson's product-moment correlation coefficients for pairs of arrays
033     * or columns of a matrix.
034     *
035     * <p>The constructors that take <code>RealMatrix</code> or
036     * <code>double[][]</code> arguments generate correlation matrices.  The
037     * columns of the input matrices are assumed to represent variable values.
038     * Correlations are given by the formula</p>
039     * <code>cor(X, Y) = &Sigma;[(x<sub>i</sub> - E(X))(y<sub>i</sub> - E(Y))] / [(n - 1)s(X)s(Y)]</code>
040     * where <code>E(X)</code> is the mean of <code>X</code>, <code>E(Y)</code>
041     * is the mean of the <code>Y</code> values and s(X), s(Y) are standard deviations.
042     *
043     * @version $Revision: 990655 $ $Date: 2010-08-29 23:49:40 +0200 (dim. 29 ao??t 2010) $
044     * @since 2.0
045     */
046    public class PearsonsCorrelation {
047    
048        /** correlation matrix */
049        private final RealMatrix correlationMatrix;
050    
051        /** number of observations */
052        private final int nObs;
053    
054        /**
055         * Create a PearsonsCorrelation instance without data
056         */
057        public PearsonsCorrelation() {
058            super();
059            correlationMatrix = null;
060            nObs = 0;
061        }
062    
063        /**
064         * Create a PearsonsCorrelation from a rectangular array
065         * whose columns represent values of variables to be correlated.
066         *
067         * @param data rectangular array with columns representing variables
068         * @throws IllegalArgumentException if the input data array is not
069         * rectangular with at least two rows and two columns.
070         */
071        public PearsonsCorrelation(double[][] data) {
072            this(new BlockRealMatrix(data));
073        }
074    
075        /**
076         * Create a PearsonsCorrelation from a RealMatrix whose columns
077         * represent variables to be correlated.
078         *
079         * @param matrix matrix with columns representing variables to correlate
080         */
081        public PearsonsCorrelation(RealMatrix matrix) {
082            checkSufficientData(matrix);
083            nObs = matrix.getRowDimension();
084            correlationMatrix = computeCorrelationMatrix(matrix);
085        }
086    
087        /**
088         * Create a PearsonsCorrelation from a {@link Covariance}.  The correlation
089         * matrix is computed by scaling the Covariance's covariance matrix.
090         * The Covariance instance must have been created from a data matrix with
091         * columns representing variable values.
092         *
093         * @param covariance Covariance instance
094         */
095        public PearsonsCorrelation(Covariance covariance) {
096            RealMatrix covarianceMatrix = covariance.getCovarianceMatrix();
097            if (covarianceMatrix == null) {
098                throw new NullArgumentException(LocalizedFormats.COVARIANCE_MATRIX);
099            }
100            nObs = covariance.getN();
101            correlationMatrix = covarianceToCorrelation(covarianceMatrix);
102        }
103    
104        /**
105         * Create a PearsonsCorrelation from a covariance matrix.  The correlation
106         * matrix is computed by scaling the covariance matrix.
107         *
108         * @param covarianceMatrix covariance matrix
109         * @param numberOfObservations the number of observations in the dataset used to compute
110         * the covariance matrix
111         */
112        public PearsonsCorrelation(RealMatrix covarianceMatrix, int numberOfObservations) {
113            nObs = numberOfObservations;
114            correlationMatrix = covarianceToCorrelation(covarianceMatrix);
115    
116        }
117    
118        /**
119         * Returns the correlation matrix
120         *
121         * @return correlation matrix
122         */
123        public RealMatrix getCorrelationMatrix() {
124            return correlationMatrix;
125        }
126    
127        /**
128         * Returns a matrix of standard errors associated with the estimates
129         * in the correlation matrix.<br/>
130         * <code>getCorrelationStandardErrors().getEntry(i,j)</code> is the standard
131         * error associated with <code>getCorrelationMatrix.getEntry(i,j)</code>
132         * <p>The formula used to compute the standard error is <br/>
133         * <code>SE<sub>r</sub> = ((1 - r<sup>2</sup>) / (n - 2))<sup>1/2</sup></code>
134         * where <code>r</code> is the estimated correlation coefficient and
135         * <code>n</code> is the number of observations in the source dataset.</p>
136         *
137         * @return matrix of correlation standard errors
138         */
139        public RealMatrix getCorrelationStandardErrors() {
140            int nVars = correlationMatrix.getColumnDimension();
141            double[][] out = new double[nVars][nVars];
142            for (int i = 0; i < nVars; i++) {
143                for (int j = 0; j < nVars; j++) {
144                    double r = correlationMatrix.getEntry(i, j);
145                    out[i][j] = FastMath.sqrt((1 - r * r) /(nObs - 2));
146                }
147            }
148            return new BlockRealMatrix(out);
149        }
150    
151        /**
152         * Returns a matrix of p-values associated with the (two-sided) null
153         * hypothesis that the corresponding correlation coefficient is zero.
154         * <p><code>getCorrelationPValues().getEntry(i,j)</code> is the probability
155         * that a random variable distributed as <code>t<sub>n-2</sub></code> takes
156         * a value with absolute value greater than or equal to <br>
157         * <code>|r|((n - 2) / (1 - r<sup>2</sup>))<sup>1/2</sup></code></p>
158         * <p>The values in the matrix are sometimes referred to as the
159         * <i>significance</i> of the corresponding correlation coefficients.</p>
160         *
161         * @return matrix of p-values
162         * @throws MathException if an error occurs estimating probabilities
163         */
164        public RealMatrix getCorrelationPValues() throws MathException {
165            TDistribution tDistribution = new TDistributionImpl(nObs - 2);
166            int nVars = correlationMatrix.getColumnDimension();
167            double[][] out = new double[nVars][nVars];
168            for (int i = 0; i < nVars; i++) {
169                for (int j = 0; j < nVars; j++) {
170                    if (i == j) {
171                        out[i][j] = 0d;
172                    } else {
173                        double r = correlationMatrix.getEntry(i, j);
174                        double t = FastMath.abs(r * FastMath.sqrt((nObs - 2)/(1 - r * r)));
175                        out[i][j] = 2 * tDistribution.cumulativeProbability(-t);
176                    }
177                }
178            }
179            return new BlockRealMatrix(out);
180        }
181    
182    
183        /**
184         * Computes the correlation matrix for the columns of the
185         * input matrix.
186         *
187         * @param matrix matrix with columns representing variables to correlate
188         * @return correlation matrix
189         */
190        public RealMatrix computeCorrelationMatrix(RealMatrix matrix) {
191            int nVars = matrix.getColumnDimension();
192            RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars);
193            for (int i = 0; i < nVars; i++) {
194                for (int j = 0; j < i; j++) {
195                  double corr = correlation(matrix.getColumn(i), matrix.getColumn(j));
196                  outMatrix.setEntry(i, j, corr);
197                  outMatrix.setEntry(j, i, corr);
198                }
199                outMatrix.setEntry(i, i, 1d);
200            }
201            return outMatrix;
202        }
203    
204        /**
205         * Computes the correlation matrix for the columns of the
206         * input rectangular array.  The colums of the array represent values
207         * of variables to be correlated.
208         *
209         * @param data matrix with columns representing variables to correlate
210         * @return correlation matrix
211         */
212        public RealMatrix computeCorrelationMatrix(double[][] data) {
213           return computeCorrelationMatrix(new BlockRealMatrix(data));
214        }
215    
216        /**
217         * Computes the Pearson's product-moment correlation coefficient between the two arrays.
218         *
219         * </p>Throws IllegalArgumentException if the arrays do not have the same length
220         * or their common length is less than 2</p>
221         *
222         * @param xArray first data array
223         * @param yArray second data array
224         * @return Returns Pearson's correlation coefficient for the two arrays
225         * @throws  IllegalArgumentException if the arrays lengths do not match or
226         * there is insufficient data
227         */
228        public double correlation(final double[] xArray, final double[] yArray) throws IllegalArgumentException {
229            SimpleRegression regression = new SimpleRegression();
230            if (xArray.length != yArray.length) {
231                throw new DimensionMismatchException(xArray.length, yArray.length);
232            } else if (xArray.length < 2) {
233                throw MathRuntimeException.createIllegalArgumentException(
234                      LocalizedFormats.INSUFFICIENT_DIMENSION, xArray.length, 2);
235            } else {
236                for(int i=0; i<xArray.length; i++) {
237                    regression.addData(xArray[i], yArray[i]);
238                }
239                return regression.getR();
240            }
241        }
242    
243        /**
244         * Derives a correlation matrix from a covariance matrix.
245         *
246         * <p>Uses the formula <br/>
247         * <code>r(X,Y) = cov(X,Y)/s(X)s(Y)</code> where
248         * <code>r(&middot,&middot;)</code> is the correlation coefficient and
249         * <code>s(&middot;)</code> means standard deviation.</p>
250         *
251         * @param covarianceMatrix the covariance matrix
252         * @return correlation matrix
253         */
254        public RealMatrix covarianceToCorrelation(RealMatrix covarianceMatrix) {
255            int nVars = covarianceMatrix.getColumnDimension();
256            RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars);
257            for (int i = 0; i < nVars; i++) {
258                double sigma = FastMath.sqrt(covarianceMatrix.getEntry(i, i));
259                outMatrix.setEntry(i, i, 1d);
260                for (int j = 0; j < i; j++) {
261                    double entry = covarianceMatrix.getEntry(i, j) /
262                           (sigma * FastMath.sqrt(covarianceMatrix.getEntry(j, j)));
263                    outMatrix.setEntry(i, j, entry);
264                    outMatrix.setEntry(j, i, entry);
265                }
266            }
267            return outMatrix;
268        }
269    
270        /**
271         * Throws IllegalArgumentException of the matrix does not have at least
272         * two columns and two rows
273         *
274         * @param matrix matrix to check for sufficiency
275         */
276        private void checkSufficientData(final RealMatrix matrix) {
277            int nRows = matrix.getRowDimension();
278            int nCols = matrix.getColumnDimension();
279            if (nRows < 2 || nCols < 2) {
280                throw MathRuntimeException.createIllegalArgumentException(
281                        LocalizedFormats.INSUFFICIENT_ROWS_AND_COLUMNS,
282                        nRows, nCols);
283            }
284        }
285    }