diff -r efd9c589177a -r c0b4a8b5a012 toolkit/javascript/d3/lib/science/science.stats.js --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/toolkit/javascript/d3/lib/science/science.stats.js Thu Apr 10 14:20:23 2014 +0200 @@ -0,0 +1,720 @@ +(function(){science.stats = {}; +// Bandwidth selectors for Gaussian kernels. +// Based on R's implementations in `stats.bw`. +science.stats.bandwidth = { + + // Silverman, B. W. (1986) Density Estimation. London: Chapman and Hall. + nrd0: function(x) { + var hi = Math.sqrt(science.stats.variance(x)); + if (!(lo = Math.min(hi, science.stats.iqr(x) / 1.34))) + (lo = hi) || (lo = Math.abs(x[1])) || (lo = 1); + return .9 * lo * Math.pow(x.length, -.2); + }, + + // Scott, D. W. (1992) Multivariate Density Estimation: Theory, Practice, and + // Visualization. Wiley. + nrd: function(x) { + var h = science.stats.iqr(x) / 1.34; + return 1.06 * Math.min(Math.sqrt(science.stats.variance(x)), h) + * Math.pow(x.length, -1/5); + } +}; +science.stats.distance = { + euclidean: function(a, b) { + var n = a.length, + i = -1, + s = 0, + x; + while (++i < n) { + x = a[i] - b[i]; + s += x * x; + } + return Math.sqrt(s); + }, + manhattan: function(a, b) { + var n = a.length, + i = -1, + s = 0; + while (++i < n) s += Math.abs(a[i] - b[i]); + return s; + }, + minkowski: function(p) { + return function(a, b) { + var n = a.length, + i = -1, + s = 0; + while (++i < n) s += Math.pow(Math.abs(a[i] - b[i]), p); + return Math.pow(s, 1 / p); + }; + }, + chebyshev: function(a, b) { + var n = a.length, + i = -1, + max = 0, + x; + while (++i < n) { + x = Math.abs(a[i] - b[i]); + if (x > max) max = x; + } + return max; + }, + hamming: function(a, b) { + var n = a.length, + i = -1, + d = 0; + while (++i < n) if (a[i] !== b[i]) d++; + return d; + }, + jaccard: function(a, b) { + var n = a.length, + i = -1, + s = 0; + while (++i < n) if (a[i] === b[i]) s++; + return s / n; + }, + braycurtis: function(a, b) { + var n = a.length, + i = -1, + s0 = 0, + s1 = 0, + ai, + bi; + while (++i < n) { + ai = a[i]; + bi = b[i]; + s0 += Math.abs(ai - bi); + s1 += Math.abs(ai + bi); + } + return s0 / s1; + } +}; +// Based on implementation in http://picomath.org/. +science.stats.erf = function(x) { + var a1 = 0.254829592, + a2 = -0.284496736, + a3 = 1.421413741, + a4 = -1.453152027, + a5 = 1.061405429, + p = 0.3275911; + + // Save the sign of x + var sign = x < 0 ? -1 : 1; + if (x < 0) { + sign = -1; + x = -x; + } + + // A&S formula 7.1.26 + var t = 1 / (1 + p * x); + return sign * ( + 1 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) + * t * Math.exp(-x * x)); +}; +science.stats.phi = function(x) { + return .5 * (1 + science.stats.erf(x / Math.SQRT2)); +}; +// See . +science.stats.kernel = { + uniform: function(u) { + if (u <= 1 && u >= -1) return .5; + return 0; + }, + triangular: function(u) { + if (u <= 1 && u >= -1) return 1 - Math.abs(u); + return 0; + }, + epanechnikov: function(u) { + if (u <= 1 && u >= -1) return .75 * (1 - u * u); + return 0; + }, + quartic: function(u) { + if (u <= 1 && u >= -1) { + var tmp = 1 - u * u; + return (15 / 16) * tmp * tmp; + } + return 0; + }, + triweight: function(u) { + if (u <= 1 && u >= -1) { + var tmp = 1 - u * u; + return (35 / 32) * tmp * tmp * tmp; + } + return 0; + }, + gaussian: function(u) { + return 1 / Math.sqrt(2 * Math.PI) * Math.exp(-.5 * u * u); + }, + cosine: function(u) { + if (u <= 1 && u >= -1) return Math.PI / 4 * Math.cos(Math.PI / 2 * u); + return 0; + } +}; +// http://exploringdata.net/den_trac.htm +science.stats.kde = function() { + var kernel = science.stats.kernel.gaussian, + sample = [], + bandwidth = science.stats.bandwidth.nrd; + + function kde(points, i) { + var bw = bandwidth.call(this, sample); + return points.map(function(x) { + var i = -1, + y = 0, + n = sample.length; + while (++i < n) { + y += kernel((x - sample[i]) / bw); + } + return [x, y / bw / n]; + }); + } + + kde.kernel = function(x) { + if (!arguments.length) return kernel; + kernel = x; + return kde; + }; + + kde.sample = function(x) { + if (!arguments.length) return sample; + sample = x; + return kde; + }; + + kde.bandwidth = function(x) { + if (!arguments.length) return bandwidth; + bandwidth = science.functor(x); + return kde; + }; + + return kde; +}; +// Based on figue implementation by Jean-Yves Delort. +// http://code.google.com/p/figue/ +science.stats.kmeans = function() { + var distance = science.stats.distance.euclidean, + maxIterations = 1000, + k = 1; + + function kmeans(vectors) { + var n = vectors.length, + assignments = [], + clusterSizes = [], + repeat = 1, + iterations = 0, + centroids = science_stats_kmeansRandom(k, vectors), + newCentroids, + i, + j, + x, + d, + min, + best; + + while (repeat && iterations < maxIterations) { + // Assignment step. + j = -1; while (++j < k) { + clusterSizes[j] = 0; + } + + i = -1; while (++i < n) { + x = vectors[i]; + min = Infinity; + j = -1; while (++j < k) { + d = distance.call(this, centroids[j], x); + if (d < min) { + min = d; + best = j; + } + } + clusterSizes[assignments[i] = best]++; + } + + // Update centroids step. + newCentroids = []; + i = -1; while (++i < n) { + x = assignments[i]; + d = newCentroids[x]; + if (d == null) newCentroids[x] = vectors[i].slice(); + else { + j = -1; while (++j < d.length) { + d[j] += vectors[i][j]; + } + } + } + j = -1; while (++j < k) { + x = newCentroids[j]; + d = 1 / clusterSizes[j]; + i = -1; while (++i < x.length) x[i] *= d; + } + + // Check convergence. + repeat = 0; + j = -1; while (++j < k) { + if (!science_stats_kmeansCompare(newCentroids[j], centroids[j])) { + repeat = 1; + break; + } + } + centroids = newCentroids; + iterations++; + } + return {assignments: assignments, centroids: centroids}; + } + + kmeans.k = function(x) { + if (!arguments.length) return k; + k = x; + return kmeans; + }; + + kmeans.distance = function(x) { + if (!arguments.length) return distance; + distance = x; + return kmeans; + }; + + return kmeans; +}; + +function science_stats_kmeansCompare(a, b) { + if (!a || !b || a.length !== b.length) return false; + var n = a.length, + i = -1; + while (++i < n) if (a[i] !== b[i]) return false; + return true; +} + +// Returns an array of k distinct vectors randomly selected from the input +// array of vectors. Returns null if k > n or if there are less than k distinct +// objects in vectors. +function science_stats_kmeansRandom(k, vectors) { + var n = vectors.length; + if (k > n) return null; + + var selected_vectors = []; + var selected_indices = []; + var tested_indices = {}; + var tested = 0; + var selected = 0; + var i, + vector, + select; + + while (selected < k) { + if (tested === n) return null; + + var random_index = Math.floor(Math.random() * n); + if (random_index in tested_indices) continue; + + tested_indices[random_index] = 1; + tested++; + vector = vectors[random_index]; + select = true; + for (i = 0; i < selected; i++) { + if (science_stats_kmeansCompare(vector, selected_vectors[i])) { + select = false; + break; + } + } + if (select) { + selected_vectors[selected] = vector; + selected_indices[selected] = random_index; + selected++; + } + } + return selected_vectors; +} +science.stats.hcluster = function() { + var distance = science.stats.distance.euclidean, + linkage = "simple"; // simple, complete or average + + function hcluster(vectors) { + var n = vectors.length, + dMin = [], + cSize = [], + distMatrix = [], + clusters = [], + c1, + c2, + c1Cluster, + c2Cluster, + p, + root, + i, + j; + + // Initialise distance matrix and vector of closest clusters. + i = -1; while (++i < n) { + dMin[i] = 0; + distMatrix[i] = []; + j = -1; while (++j < n) { + distMatrix[i][j] = i === j ? Infinity : distance(vectors[i] , vectors[j]); + if (distMatrix[i][dMin[i]] > distMatrix[i][j]) dMin[i] = j; + } + } + + // create leaves of the tree + i = -1; while (++i < n) { + clusters[i] = []; + clusters[i][0] = { + left: null, + right: null, + dist: 0, + centroid: vectors[i], + size: 1, + depth: 0 + }; + cSize[i] = 1; + } + + // Main loop + for (p = 0; p < n-1; p++) { + // find the closest pair of clusters + c1 = 0; + for (i = 0; i < n; i++) { + if (distMatrix[i][dMin[i]] < distMatrix[c1][dMin[c1]]) c1 = i; + } + c2 = dMin[c1]; + + // create node to store cluster info + c1Cluster = clusters[c1][0]; + c2Cluster = clusters[c2][0]; + + newCluster = { + left: c1Cluster, + right: c2Cluster, + dist: distMatrix[c1][c2], + centroid: calculateCentroid(c1Cluster.size, c1Cluster.centroid, + c2Cluster.size, c2Cluster.centroid), + size: c1Cluster.size + c2Cluster.size, + depth: 1 + Math.max(c1Cluster.depth, c2Cluster.depth) + }; + clusters[c1].splice(0, 0, newCluster); + cSize[c1] += cSize[c2]; + + // overwrite row c1 with respect to the linkage type + for (j = 0; j < n; j++) { + switch (linkage) { + case "single": + if (distMatrix[c1][j] > distMatrix[c2][j]) + distMatrix[j][c1] = distMatrix[c1][j] = distMatrix[c2][j]; + break; + case "complete": + if (distMatrix[c1][j] < distMatrix[c2][j]) + distMatrix[j][c1] = distMatrix[c1][j] = distMatrix[c2][j]; + break; + case "average": + distMatrix[j][c1] = distMatrix[c1][j] = (cSize[c1] * distMatrix[c1][j] + cSize[c2] * distMatrix[c2][j]) / (cSize[c1] + cSize[j]); + break; + } + } + distMatrix[c1][c1] = Infinity; + + // infinity ­out old row c2 and column c2 + for (i = 0; i < n; i++) + distMatrix[i][c2] = distMatrix[c2][i] = Infinity; + + // update dmin and replace ones that previous pointed to c2 to point to c1 + for (j = 0; j < n; j++) { + if (dMin[j] == c2) dMin[j] = c1; + if (distMatrix[c1][j] < distMatrix[c1][dMin[c1]]) dMin[c1] = j; + } + + // keep track of the last added cluster + root = newCluster; + } + + return root; + } + + hcluster.distance = function(x) { + if (!arguments.length) return distance; + distance = x; + return hcluster; + }; + + return hcluster; +}; + +function calculateCentroid(c1Size, c1Centroid, c2Size, c2Centroid) { + var newCentroid = [], + newSize = c1Size + c2Size, + n = c1Centroid.length, + i = -1; + while (++i < n) { + newCentroid[i] = (c1Size * c1Centroid[i] + c2Size * c2Centroid[i]) / newSize; + } + return newCentroid; +} +science.stats.iqr = function(x) { + var quartiles = science.stats.quantiles(x, [.25, .75]); + return quartiles[1] - quartiles[0]; +}; +// Based on org.apache.commons.math.analysis.interpolation.LoessInterpolator +// from http://commons.apache.org/math/ +science.stats.loess = function() { + var bandwidth = .3, + robustnessIters = 2, + accuracy = 1e-12; + + function smooth(xval, yval, weights) { + var n = xval.length, + i; + + if (n !== yval.length) throw {error: "Mismatched array lengths"}; + if (n == 0) throw {error: "At least one point required."}; + + if (arguments.length < 3) { + weights = []; + i = -1; while (++i < n) weights[i] = 1; + } + + science_stats_loessFiniteReal(xval); + science_stats_loessFiniteReal(yval); + science_stats_loessFiniteReal(weights); + science_stats_loessStrictlyIncreasing(xval); + + if (n == 1) return [yval[0]]; + if (n == 2) return [yval[0], yval[1]]; + + var bandwidthInPoints = Math.floor(bandwidth * n); + + if (bandwidthInPoints < 2) throw {error: "Bandwidth too small."}; + + var res = [], + residuals = [], + robustnessWeights = []; + + // Do an initial fit and 'robustnessIters' robustness iterations. + // This is equivalent to doing 'robustnessIters+1' robustness iterations + // starting with all robustness weights set to 1. + i = -1; while (++i < n) { + res[i] = 0; + residuals[i] = 0; + robustnessWeights[i] = 1; + } + + var iter = -1; + while (++iter <= robustnessIters) { + var bandwidthInterval = [0, bandwidthInPoints - 1]; + // At each x, compute a local weighted linear regression + var x; + i = -1; while (++i < n) { + x = xval[i]; + + // Find out the interval of source points on which + // a regression is to be made. + if (i > 0) { + science_stats_loessUpdateBandwidthInterval(xval, weights, i, bandwidthInterval); + } + + var ileft = bandwidthInterval[0], + iright = bandwidthInterval[1]; + + // Compute the point of the bandwidth interval that is + // farthest from x + var edge = (xval[i] - xval[ileft]) > (xval[iright] - xval[i]) ? ileft : iright; + + // Compute a least-squares linear fit weighted by + // the product of robustness weights and the tricube + // weight function. + // See http://en.wikipedia.org/wiki/Linear_regression + // (section "Univariate linear case") + // and http://en.wikipedia.org/wiki/Weighted_least_squares + // (section "Weighted least squares") + var sumWeights = 0, + sumX = 0, + sumXSquared = 0, + sumY = 0, + sumXY = 0, + denom = Math.abs(1 / (xval[edge] - x)); + + for (var k = ileft; k <= iright; ++k) { + var xk = xval[k], + yk = yval[k], + dist = k < i ? x - xk : xk - x, + w = science_stats_loessTricube(dist * denom) * robustnessWeights[k] * weights[k], + xkw = xk * w; + sumWeights += w; + sumX += xkw; + sumXSquared += xk * xkw; + sumY += yk * w; + sumXY += yk * xkw; + } + + var meanX = sumX / sumWeights, + meanY = sumY / sumWeights, + meanXY = sumXY / sumWeights, + meanXSquared = sumXSquared / sumWeights; + + var beta = (Math.sqrt(Math.abs(meanXSquared - meanX * meanX)) < accuracy) + ? 0 : ((meanXY - meanX * meanY) / (meanXSquared - meanX * meanX)); + + var alpha = meanY - beta * meanX; + + res[i] = beta * x + alpha; + residuals[i] = Math.abs(yval[i] - res[i]); + } + + // No need to recompute the robustness weights at the last + // iteration, they won't be needed anymore + if (iter === robustnessIters) { + break; + } + + // Recompute the robustness weights. + + // Find the median residual. + var sortedResiduals = residuals.slice(); + sortedResiduals.sort(); + var medianResidual = sortedResiduals[Math.floor(n / 2)]; + + if (Math.abs(medianResidual) < accuracy) + break; + + var arg, + w; + i = -1; while (++i < n) { + arg = residuals[i] / (6 * medianResidual); + robustnessWeights[i] = (arg >= 1) ? 0 : ((w = 1 - arg * arg) * w); + } + } + + return res; + } + + smooth.bandwidth = function(x) { + if (!arguments.length) return x; + bandwidth = x; + return smooth; + }; + + smooth.robustnessIterations = function(x) { + if (!arguments.length) return x; + robustnessIters = x; + return smooth; + }; + + smooth.accuracy = function(x) { + if (!arguments.length) return x; + accuracy = x; + return smooth; + }; + + return smooth; +}; + +function science_stats_loessFiniteReal(values) { + var n = values.length, + i = -1; + + while (++i < n) if (!isFinite(values[i])) return false; + + return true; +} + +function science_stats_loessStrictlyIncreasing(xval) { + var n = xval.length, + i = 0; + + while (++i < n) if (xval[i - 1] >= xval[i]) return false; + + return true; +} + +// Compute the tricube weight function. +// http://en.wikipedia.org/wiki/Local_regression#Weight_function +function science_stats_loessTricube(x) { + return (x = 1 - x * x * x) * x * x; +} + +// Given an index interval into xval that embraces a certain number of +// points closest to xval[i-1], update the interval so that it embraces +// the same number of points closest to xval[i], ignoring zero weights. +function science_stats_loessUpdateBandwidthInterval( + xval, weights, i, bandwidthInterval) { + + var left = bandwidthInterval[0], + right = bandwidthInterval[1]; + + // The right edge should be adjusted if the next point to the right + // is closer to xval[i] than the leftmost point of the current interval + var nextRight = science_stats_loessNextNonzero(weights, right); + if ((nextRight < xval.length) && (xval[nextRight] - xval[i]) < (xval[i] - xval[left])) { + var nextLeft = science_stats_loessNextNonzero(weights, left); + bandwidthInterval[0] = nextLeft; + bandwidthInterval[1] = nextRight; + } +} + +function science_stats_loessNextNonzero(weights, i) { + var j = i + 1; + while (j < weights.length && weights[j] === 0) j++; + return j; +} +// Welford's algorithm. +science.stats.mean = function(x) { + var n = x.length; + if (n === 0) return NaN; + var m = 0, + i = -1; + while (++i < n) m += (x[i] - m) / (i + 1); + return m; +}; +science.stats.median = function(x) { + return science.stats.quantiles(x, [.5])[0]; +}; +science.stats.mode = function(x) { + x = x.slice().sort(science.ascending); + var mode, + n = x.length, + i = -1, + l = i, + last = null, + max = 0, + tmp, + v; + while (++i < n) { + if ((v = x[i]) !== last) { + if ((tmp = i - l) > max) { + max = tmp; + mode = last; + } + last = v; + l = i; + } + } + return mode; +}; +// Uses R's quantile algorithm type=7. +science.stats.quantiles = function(d, quantiles) { + d = d.slice().sort(science.ascending); + var n_1 = d.length - 1; + return quantiles.map(function(q) { + if (q === 0) return d[0]; + else if (q === 1) return d[n_1]; + + var index = 1 + q * n_1, + lo = Math.floor(index), + h = index - lo, + a = d[lo - 1]; + + return h === 0 ? a : a + h * (d[lo] - a); + }); +}; +// Unbiased estimate of a sample's variance. +// Also known as the sample variance, where the denominator is n - 1. +science.stats.variance = function(x) { + var n = x.length; + if (n < 1) return NaN; + if (n === 1) return 0; + var mean = science.stats.mean(x), + i = -1, + s = 0; + while (++i < n) { + var v = x[i] - mean; + s += v * v; + } + return s / (n - 1); +}; +})() \ No newline at end of file