View Javadoc
1   /*
2    * Copyright (C) 2012 The Guava Authors
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
5    * in compliance with the License. You may obtain a copy of the License at
6    *
7    * http://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software distributed under the License
10   * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
11   * or implied. See the License for the specific language governing permissions and limitations under
12   * the License.
13   */
14  
15  package com.google.common.math;
16  
17  import static com.google.common.base.Preconditions.checkArgument;
18  import static com.google.common.base.Preconditions.checkNotNull;
19  import static com.google.common.base.Preconditions.checkState;
20  import static java.lang.Double.NaN;
21  import static java.lang.Double.doubleToLongBits;
22  import static java.lang.Double.isNaN;
23  
24  import com.google.common.annotations.Beta;
25  import com.google.common.annotations.GwtIncompatible;
26  import com.google.common.base.MoreObjects;
27  import com.google.common.base.Objects;
28  import java.io.Serializable;
29  import java.nio.ByteBuffer;
30  import java.nio.ByteOrder;
31  import javax.annotation.Nullable;
32  
33  /**
34   * An immutable value object capturing some basic statistics about a collection of paired double
35   * values (e.g. points on a plane). Build instances with {@link PairedStatsAccumulator#snapshot}.
36   *
37   * @author Pete Gillin
38   * @since 20.0
39   */
40  @Beta
41  @GwtIncompatible
42  public final class PairedStats implements Serializable {
43  
44    private final Stats xStats;
45    private final Stats yStats;
46    private final double sumOfProductsOfDeltas;
47  
48    /**
49     * Internal constructor. Users should use {@link PairedStatsAccumulator#snapshot}.
50     *
51     * <p>To ensure that the created instance obeys its contract, the parameters should satisfy the
52     * following constraints. This is the callers responsibility and is not enforced here.
53     *
54     * <ul>
55     *   <li>Both {@code xStats} and {@code yStats} must have the same {@code count}.
56     *   <li>If that {@code count} is 1, {@code sumOfProductsOfDeltas} must be exactly 0.0.
57     *   <li>If that {@code count} is more than 1, {@code sumOfProductsOfDeltas} must be finite.
58     * </ul>
59     */
60    PairedStats(Stats xStats, Stats yStats, double sumOfProductsOfDeltas) {
61      this.xStats = xStats;
62      this.yStats = yStats;
63      this.sumOfProductsOfDeltas = sumOfProductsOfDeltas;
64    }
65  
66    /**
67     * Returns the number of pairs in the dataset.
68     */
69    public long count() {
70      return xStats.count();
71    }
72  
73    /**
74     * Returns the statistics on the {@code x} values alone.
75     */
76    public Stats xStats() {
77      return xStats;
78    }
79  
80    /**
81     * Returns the statistics on the {@code y} values alone.
82     */
83    public Stats yStats() {
84      return yStats;
85    }
86  
87    /**
88     * Returns the population covariance of the values. The count must be non-zero.
89     *
90     * <p>This is guaranteed to return zero if the dataset contains a single pair of finite values. It
91     * is not guaranteed to return zero when the dataset consists of the same pair of values multiple
92     * times, due to numerical errors.
93     *
94     * <h3>Non-finite values</h3>
95     *
96     * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY},
97     * {@link Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
98     *
99     * @throws IllegalStateException if the dataset is empty
100    */
101   public double populationCovariance() {
102     checkState(count() != 0);
103     return sumOfProductsOfDeltas / count();
104   }
105 
106   /**
107    * Returns the sample covariance of the values. The count must be greater than one.
108    *
109    * <p>This is not guaranteed to return zero when the dataset consists of the same pair of values
110    * multiple times, due to numerical errors.
111    *
112    * <h3>Non-finite values</h3>
113    *
114    * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY},
115    * {@link Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
116    *
117    * @throws IllegalStateException if the dataset is empty or contains a single pair of values
118    */
119   public double sampleCovariance() {
120     checkState(count() > 1);
121     return sumOfProductsOfDeltas / (count() - 1);
122   }
123 
124   /**
125    * Returns the <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">Pearson's or
126    * product-moment correlation coefficient</a> of the values. The count must greater than one, and
127    * the {@code x} and {@code y} values must both have non-zero population variance (i.e.
128    * {@code xStats().populationVariance() > 0.0 && yStats().populationVariance() > 0.0}). The result
129    * is not guaranteed to be exactly +/-1 even when the data are perfectly (anti-)correlated, due to
130    * numerical errors. However, it is guaranteed to be in the inclusive range [-1, +1].
131    *
132    * <h3>Non-finite values</h3>
133    *
134    * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY},
135    * {@link Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
136    *
137    * @throws IllegalStateException if the dataset is empty or contains a single pair of values, or
138    *     either the {@code x} and {@code y} dataset has zero population variance
139    */
140   public double pearsonsCorrelationCoefficient() {
141     checkState(count() > 1);
142     if (isNaN(sumOfProductsOfDeltas)) {
143       return NaN;
144     }
145     double xSumOfSquaresOfDeltas = xStats().sumOfSquaresOfDeltas();
146     double ySumOfSquaresOfDeltas = yStats().sumOfSquaresOfDeltas();
147     checkState(xSumOfSquaresOfDeltas > 0.0);
148     checkState(ySumOfSquaresOfDeltas > 0.0);
149     // The product of two positive numbers can be zero if the multiplication underflowed. We
150     // force a positive value by effectively rounding up to MIN_VALUE.
151     double productOfSumsOfSquaresOfDeltas =
152         ensurePositive(xSumOfSquaresOfDeltas * ySumOfSquaresOfDeltas);
153     return ensureInUnitRange(sumOfProductsOfDeltas / Math.sqrt(productOfSumsOfSquaresOfDeltas));
154   }
155 
156   /**
157    * Returns a linear transformation giving the best fit to the data according to
158    * <a href="http://mathworld.wolfram.com/LeastSquaresFitting.html">Ordinary Least Squares linear
159    * regression</a> of {@code y} as a function of {@code x}. The count must be greater than one, and
160    * either the {@code x} or {@code y} data must have a non-zero population variance (i.e.
161    * {@code xStats().populationVariance() > 0.0 || yStats().populationVariance() > 0.0}). The result
162    * is guaranteed to be horizontal if there is variance in the {@code x} data but not the {@code y}
163    * data, and vertical if there is variance in the {@code y} data but not the {@code x} data.
164    *
165    * <p>This fit minimizes the root-mean-square error in {@code y} as a function of {@code x}. This
166    * error is defined as the square root of the mean of the squares of the differences between the
167    * actual {@code y} values of the data and the values predicted by the fit for the {@code x}
168    * values (i.e. it is the square root of the mean of the squares of the vertical distances between
169    * the data points and the best fit line). For this fit, this error is a fraction
170    * {@code sqrt(1 - R*R)} of the population standard deviation of {@code y}, where {@code R} is the
171    * Pearson's correlation coefficient (as given by {@link #pearsonsCorrelationCoefficient()}).
172    *
173    * <p>The corresponding root-mean-square error in {@code x} as a function of {@code y} is a
174    * fraction {@code sqrt(1/(R*R) - 1)} of the population standard deviation of {@code x}. This fit
175    * does not normally minimize that error: to do that, you should swap the roles of {@code x} and
176    * {@code y}.
177    *
178    * <h3>Non-finite values</h3>
179    *
180    * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY},
181    * {@link Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is
182    * {@link LinearTransformation#forNaN()}.
183    *
184    * @throws IllegalStateException if the dataset is empty or contains a single pair of values, or
185    *     both the {@code x} and {@code y} dataset must have zero population variance
186    */
187   public LinearTransformation leastSquaresFit() {
188     checkState(count() > 1);
189     if (isNaN(sumOfProductsOfDeltas)) {
190       return LinearTransformation.forNaN();
191     }
192     double xSumOfSquaresOfDeltas = xStats.sumOfSquaresOfDeltas();
193     if (xSumOfSquaresOfDeltas > 0.0) {
194       if (yStats.sumOfSquaresOfDeltas() > 0.0) {
195         return LinearTransformation.mapping(xStats.mean(), yStats.mean())
196             .withSlope(sumOfProductsOfDeltas / xSumOfSquaresOfDeltas);
197       } else {
198         return LinearTransformation.horizontal(yStats.mean());
199       }
200     } else {
201       checkState(yStats.sumOfSquaresOfDeltas() > 0.0);
202       return LinearTransformation.vertical(xStats.mean());
203     }
204   }
205 
206   /**
207    * {@inheritDoc}
208    *
209    * <p><b>Note:</b> This tests exact equality of the calculated statistics, including the floating
210    * point values. Two instances are guaranteed to be considered equal if one is copied from the
211    * other using {@code second = new PairedStatsAccumulator().addAll(first).snapshot()}, if both
212    * were obtained by calling {@code snapshot()} on the same {@link PairedStatsAccumulator} without
213    * adding any values in between the two calls, or if one is obtained from the other after
214    * round-tripping through java serialization. However, floating point rounding errors mean that it
215    * may be false for some instances where the statistics are mathematically equal, including
216    * instances constructed from the same values in a different order... or (in the general case)
217    * even in the same order. (It is guaranteed to return true for instances constructed from the
218    * same values in the same order if {@code strictfp} is in effect, or if the system architecture
219    * guarantees {@code strictfp}-like semantics.)
220    */
221   @Override
222   public boolean equals(@Nullable Object obj) {
223     if (obj == null) {
224       return false;
225     }
226     if (getClass() != obj.getClass()) {
227       return false;
228     }
229     PairedStats other = (PairedStats) obj;
230     return (xStats.equals(other.xStats))
231         && (yStats.equals(other.yStats))
232         && (doubleToLongBits(sumOfProductsOfDeltas)
233             == doubleToLongBits(other.sumOfProductsOfDeltas));
234   }
235 
236   /**
237    * {@inheritDoc}
238    *
239    * <p><b>Note:</b> This hash code is consistent with exact equality of the calculated statistics,
240    * including the floating point values. See the note on {@link #equals} for details.
241    */
242   @Override
243   public int hashCode() {
244     return Objects.hashCode(xStats, yStats, sumOfProductsOfDeltas);
245   }
246 
247   @Override
248   public String toString() {
249     if (count() > 0) {
250       return MoreObjects.toStringHelper(this)
251           .add("xStats", xStats)
252           .add("yStats", yStats)
253           .add("populationCovariance", populationCovariance())
254           .toString();
255     } else {
256       return MoreObjects.toStringHelper(this)
257           .add("xStats", xStats)
258           .add("yStats", yStats)
259           .toString();
260     }
261   }
262 
263   double sumOfProductsOfDeltas() {
264     return sumOfProductsOfDeltas;
265   }
266 
267   private static double ensurePositive(double value) {
268     if (value > 0.0) {
269       return value;
270     } else {
271       return Double.MIN_VALUE;
272     }
273   }
274 
275   private static double ensureInUnitRange(double value) {
276     if (value >= 1.0) {
277       return 1.0;
278     }
279     if (value <= -1.0) {
280       return -1.0;
281     }
282     return value;
283   }
284 
285   // Serialization helpers
286 
287   /**
288    * The size of byte array representation in bytes.
289    */
290   private static final int BYTES = Stats.BYTES * 2 + Double.SIZE / Byte.SIZE;
291 
292   /**
293    * Gets a byte array representation of this instance.
294    *
295    * <p><b>Note:</b> No guarantees are made regarding stability of the representation between
296    * versions.
297    */
298   public byte[] toByteArray() {
299     ByteBuffer buffer = ByteBuffer.allocate(BYTES).order(ByteOrder.LITTLE_ENDIAN);
300     xStats.writeTo(buffer);
301     yStats.writeTo(buffer);
302     buffer.putDouble(sumOfProductsOfDeltas);
303     return buffer.array();
304   }
305 
306   /**
307    * Creates a {@link PairedStats} instance from the given byte representation which was obtained by
308    * {@link #toByteArray}.
309    *
310    * <p><b>Note:</b> No guarantees are made regarding stability of the representation between
311    * versions.
312    */
313   public static PairedStats fromByteArray(byte[] byteArray) {
314     checkNotNull(byteArray);
315     checkArgument(
316         byteArray.length == BYTES,
317         "Expected PairedStats.BYTES = %s, got %s",
318         BYTES,
319         byteArray.length);
320     ByteBuffer buffer = ByteBuffer.wrap(byteArray).order(ByteOrder.LITTLE_ENDIAN);
321     Stats xStats = Stats.readFrom(buffer);
322     Stats yStats = Stats.readFrom(buffer);
323     double sumOfProductsOfDeltas = buffer.getDouble();
324     return new PairedStats(xStats, yStats, sumOfProductsOfDeltas);
325   }
326 
327   private static final long serialVersionUID = 0;
328 }