wsatools.Statistics

1 #------------------------------------------------------------------------------ 2 #$Id: Statistics.py 9998 2013-08-09 08:44:31Z NicholasCross $ 3 """ 4 Statistics package for photometry. 5 6 @author: N.J.G. Cross 7 @org: WFAU, IfA, University of Edinburgh 8 9 @requires: Numpy, math, Scientific 10 """ 11 #------------------------------------------------------------------------------ 12 import math 13 import numpy 14 from Scientific.Functions import LeastSquares as LSQ 15 16 import wsatools.DbConnect.DbConstants as dbc 17 #------------------------------------------------------------------------------ 18

19 -def biweight(xx, cc):

20 """Calculates the biweight - Beers et al. 1990. 21 cc is the tuning constant -- 6.0 (approx 4sigma) is good for the location. 22 """ 23 mui = biweightmui(xx, cc) 24 med = numpy.median(xx) 25 test = numpy.multiply(numpy.less(numpy.abs(mui), 1.), 1.) 26 denom = numpy.sum(numpy.multiply(numpy.multiply(numpy.subtract(xx, med), numpy.power(numpy.subtract(1., numpy.power(mui, 2.)), 2.)), test)) 27 numer = numpy.sum(numpy.multiply(numpy.power(numpy.subtract(1., numpy.power(mui, 2.)), 2.), test)) 28 if abs(numer) > 0: 29 rat = denom / numer 30 return med + rat 31 else: 32 return dbc.realDefault()

33 34 #------------------------------------------------------------------------------ 35

36 -def biweightmui(xx, cc):

37 """ Calculates the biweight mui - Beers et al. 1990. 38 """ 39 med, MD = MAD(xx) 40 #@TODO: add in numpy.where 41 return numpy.divide(numpy.subtract(xx, med), cc * MD)

42 43 #------------------------------------------------------------------------------ 44

45 -def biweightScale(xx, cc):

46 """Calculates the biweight scale - Beers et al. 1990. 47 cc is the tuning constant -- 9.0 is good for the scale. 48 """ 49 mui = biweightmui(xx, cc) 50 med = numpy.median(xx) 51 test = numpy.multiply(numpy.less(numpy.abs(mui), 1.), 1.) 52 denom = numpy.sum(numpy.multiply(numpy.multiply(numpy.power(numpy.subtract(xx, med), 2.), numpy.power(numpy.subtract(1., numpy.power(mui, 2.)), 4.)), test)) 53 numer = numpy.sum(numpy.multiply(numpy.multiply(numpy.subtract(1., numpy.power(mui, 2.)), numpy.subtract(1., numpy.multiply(5., numpy.power(mui, 2.)))), test)) 54 if abs(numer) > 0.: 55 rat = numpy.sqrt(denom) / abs(numer) 56 return numpy.sqrt(1.0 * numpy(xx).size) * rat 57 else: 58 return dbc.realDefault()

59 60 #------------------------------------------------------------------------------ 61

62 -def calcMean(xx):

63 """This calculates the mean 64 """ 65 if xx.shape[0] > 1: 66 mean = numpy.array(numpy.mean(xx, axis=0)) 67 else: 68 mean = dbc.realDefault() 69 return mean

70 71 #------------------------------------------------------------------------------ 72

73 -def calcMAD(xx, minSd):

74 """ 75 """ 76 if xx.shape[0] > 1: 77 med, mad = MAD(xx, minSd) 78 else: 79 med, mad = dbc.realDefault(), dbc.realDefault() 80 return med, mad

81 82 #------------------------------------------------------------------------------ 83

84 -def calcSd(xx):

85 if xx.shape[0] > 1: 86 # @TODO: add in numpy.where 87 sd = numpy.multiply(numpy.std(xx, axis=0), numpy.sqrt(numpy.divide(\ 88 xx.shape[-1], numpy.subtract(xx.shape[-1], 1.)))) 89 else: 90 sd = dbc.realDefault() 91 return sd

92 93 #------------------------------------------------------------------------------ 94

95 -def calcSkew(xx):

96 if xx.shape[0] > 2: 97 denom = numpy.multiply(numpy.multiply( 98 numpy.subtract(xx.shape[-1], 1.), 99 numpy.subtract(xx.shape[-1], 2.)), numpy.power(calcSd(xx), 3.)) 100 101 skew = numpy.where(denom > 0, 102 numpy.divide(numpy.multiply(numpy.power(xx.shape[-1], 2.), 103 centralMoment(xx, 3)), denom), 104 dbc.realDefault()) 105 else: 106 skew = dbc.realDefault() 107 return skew

108 109 #------------------------------------------------------------------------------ 110

111 -def centralMoment(data, N):

112 data = numpy.array(data) 113 if data.shape[0] > 0: 114 return numpy.mean(numpy.power(numpy.subtract(data, numpy.mean(data, 115 axis=0)), 116 float(N)), axis=0) 117 else: 118 return dbc.realDefault()

119 120 #------------------------------------------------------------------------------ 121

122 -def chiSq(mag, rms, model, nParam):

123 """This returns the reduced chiSquared and number of degrees of freedom. 124 """ 125 #@TODO: add in numpy.where 126 chisq = numpy.sum(numpy.power(numpy.divide(numpy.subtract(mag, model), rms), 2.)) 127 ndof = len(mag) - nParam 128 return chisq / float(ndof), ndof

129 130 #------------------------------------------------------------------------------ 131

132 -def chiSquare(model, parameters, data):

133 chisq = 0. 134 for point in data: 135 sigma = 1 136 if len(point) == 3: 137 sigma = point[2] 138 f = model(parameters, point[0]) 139 chisq = chisq + ((f - point[1]) / sigma) ** 2 140 return chisq

141 142 #------------------------------------------------------------------------------ 143

144 -def clippedHistogram(xx, clip=0, Niter=10, imean=dbc.realDefault(), 145 isd=dbc.realDefault()):

146 """ 147 Calculates mean and standard deviation of clipped distribution. 148 149 """ 150 xx = numpy.array(xx) 151 if xx.size < 2: 152 return dbc.realDefault(), dbc.realDefault() 153 154 if clip == 0: 155 mean = calcMean(xx) 156 sd = calcSd(xx) 157 else: 158 test = 0 159 if imean == dbc.realDefault() and isd == dbc.realDefault(): 160 mean = calcMean(xx) 161 sd = calcSd(xx) 162 else: 163 mean, sd = imean, isd 164 165 it = 0 166 while (test == 0) and it < Niter: 167 meanold, sdold = mean, sd 168 minV = mean - clip * sd 169 maxV = mean + clip * sd 170 xx_mask = maskArray(xx, minV, maxV) 171 mean = calcMean(xx_mask) 172 sd = calcSd(xx_mask) 173 xx_mask = [] 174 if mean == meanold and sd == sdold: 175 test = 1 176 it += 1 177 del xx 178 if mean == 0. and sd == 0.: 179 mean, sd = dbc.realDefault(), dbc.realDefault() 180 181 return mean, sd

182 183 #------------------------------------------------------------------------------ 184

185 -def clippedMADHistogram(xx, clip=0, Niter=10, imed=dbc.realDefault(), imad=dbc.realDefault(), retValues="median,psd", minSd=None):

186 """ 187 Calculates mean,median,standard deviation and median absolute deviation of 188 a clipped distribution. The distribution is clipped by a certain number of 189 median absolute deviations. 190 191 @param xx: The input list or array of values. 192 @type xx: list 193 @param clip: The number of 'standard deviations for clipping' 194 @type clip: float 195 @param Niter: The number iterations when clipping 196 @type Niter: int 197 @param imed: starting value for the median BEST NOT TO USE 198 @type imed: float 199 @param imad: starting value for the MAD BEST NOT TO USE 200 @type imad: float 201 @param retValues: string describing which values to return 202 @type retValues: str 203 204 @return: tuple of statistical values described in retValues 205 @rtype: tuple 206 207 """ 208 xx = numpy.array(xx) 209 dimen = len(xx.shape) 210 if clip != 0. and dimen > 1: 211 print ("The array dimension is too large for clipping. Either do ", 212 "not clip or use a 1-d array") 213 return -1 214 clip = clip * 1.48 215 # select data 216 if not minSd: 217 minSd = getMinSd(xx) 218 if clip == 0.: 219 if xx.size > 0: 220 med, mad = calcMAD(xx, minSd) 221 else: 222 if dimen == 1: 223 med, mad = dbc.realDefault(), dbc.realDefault() 224 else: 225 med = numpy.multiply(numpy.ones, (numpy.shape(xx)[:-1], 226 numpy.float32), 227 dbc.realDefault()) 228 mad = numpy.multiply(numpy.ones, (numpy.shape(xx)[:-1], 229 numpy.float32), 230 dbc.realDefault()) 231 xx_mask = xx 232 else: 233 test = 0 234 235 if imed == dbc.realDefault() and imad == dbc.realDefault() and xx.size > 0: 236 med, mad = calcMAD(xx, minSd) 237 else: 238 if dimen == 1: 239 med, mad = imed, imad 240 else: 241 med = numpy.multiply(numpy.ones, (numpy.shape(xx)[:-1], 242 numpy.float32), 243 imed) 244 mad = numpy.multiply(numpy.ones, (numpy.shape(xx)[:-1], 245 numpy.float32), 246 imad) 247 it = 0 248 while (test == 0) and it < Niter: 249 medold, madold = med, mad 250 minV = med - (clip * mad) 251 maxV = med + (clip * mad) 252 # Make sure all values are >0 apart from those outside range = 0. 253 # then remove all 254 xx_mask = maskArray(xx, minV, maxV) 255 if xx_mask.size > 0: 256 med, mad = calcMAD(xx_mask, minSd) 257 else: 258 med, mad = dbc.realDefault(), dbc.realDefault() 259 test = 1 260 if med == medold and mad == madold: 261 test = 1 262 263 it += 1 264 # calculate final attributes 265 med = numpy.where(numpy.logical_and(numpy.equal(med, 0.), 266 numpy.equal(mad, 0.)), 267 dbc.realDefault(), med) 268 mad = numpy.where(numpy.logical_and(numpy.equal(med, 0.), 269 numpy.equal(mad, 0.)), 270 dbc.realDefault(), mad) 271 psd = numpy.where(numpy.greater(mad, dbc.realDefault()), 272 numpy.multiply(mad, 1.48), 273 dbc.realDefault()) 274 returnValueList = retValues.split(',') 275 possValueList = ["median", "med", "mean", "mad", "psd", "stdev", "tot", "skew", "min", "max"] 276 valList = [] 277 for val in returnValueList: 278 if val.lower() not in possValueList: 279 print ("<Warning>Return value %s not recognised. Will return" 280 " default value" % val) 281 valList.append(dbc.realDefault()) 282 elif val.lower() == 'median' or val.lower() == 'med': 283 valList.append(med) 284 elif val.lower() == 'mean': 285 valList.append(calcMean(xx_mask)) 286 elif val.lower() == 'mad': 287 valList.append(mad) 288 elif val.lower() == 'psd': 289 valList.append(psd) 290 elif val.lower() == 'tot': 291 if dimen > 1: 292 valList.append(numpy.array(xx_mask.shape[-1])) 293 else: 294 valList.append(xx_mask.size) 295 elif val.lower() == 'stdev': 296 valList.append(calcSd(xx_mask)) 297 elif val.lower() == 'skew': 298 valList.append(calcSkew(xx_mask)) 299 elif val.lower() == 'max': 300 valList.append(numpy.max(xx_mask)) 301 elif val.lower() == 'min': 302 valList.append(numpy.min(xx_mask)) 303 304 return tuple(valList)

305 306 #------------------------------------------------------------------------------ 307

308 -def firstMomentDist(x, y):

309 """This calculates the mean value of x - sum(xy)/sum(y) 310 """ 311 numer, denom = 0., 0. 312 for i in range(len(x)): 313 numer += float(y[i]) 314 denom += float(x[i]) * float(y[i]) 315 if numer != 0.: 316 return (denom / numer) 317 else: 318 print "Zero Division Error" 319 return dbc.realDefault()

320 321 #------------------------------------------------------------------------------ 322

323 -def gammln(betaArray):

324 """ Calculates log(Gamma(beta)). 325 Lanczos approximation. Numerical Recipes. |e|<2e-10. 326 This works for any complex argument with a nonnegative real part 327 328 @param betaArray: Array of input values for which the gamma function. 329 @type betaArray: numpy array 330 @return: log(Gamma(beta). 331 @rtype: numpy array 332 """ 333 coeffs = [76.18009172947146, -86.5053203941677, 24.01409824083091, 334 - 1.231739572450155, 0.1208650973866179e-2, -0.5395239284953e-5] 335 xx, yy = betaArray, betaArray 336 zz = numpy.add(xx, 5.5) 337 gamma = numpy.subtract(numpy.multiply(numpy.add(xx, 0.5), numpy.log(numpy.where(zz > 0, zz, 1.))), zz) 338 summation = numpy.multiply(1.000000000190015, numpy.ones([betaArray.size], numpy.float64)) 339 for coeff in coeffs: 340 yy = numpy.add(yy, 1) 341 summation = numpy.where(numpy.greater(yy, 0.), 342 numpy.add(summation, numpy.divide(coeff, numpy.where(numpy.not_equal(yy, 0.), yy, 1.))), 0.) 343 344 return numpy.where(numpy.logical_and(numpy.greater(xx, 0.), numpy.greater(summation, 0.)), 345 numpy.add(gamma, numpy.log(numpy.divide(numpy.multiply(2.5066282746310005, summation), xx))), 346 0.)

347 348 #------------------------------------------------------------------------------ 349

350 -def getMinSd(xx):

351 """ Calculates minimum standard deviation for a distribution. 352 This is to avoid zero values in the calculation of the median 353 absolute deviation. This looks to see if the data is binned/rounded 354 and if not, it will calculate a minimum based on double precision values 355 356 @param xx: array of input values 357 @type xx: numpy array 358 @return: minimum standard deviation 359 @rtype: float 360 """ 361 if xx.size == 0: 362 return -1 363 if xx.size == 1: 364 return 0 365 if numpy.array(xx).dtype == int: 366 sf = 0 367 else: 368 sf = 0 369 test = False 370 distVals = list(numpy.unique(numpy.abs(xx))) 371 absMin = distVals[len(distVals) - 1] / 9.22e18 372 if absMin == 0.: 373 absMin = 1. / 9.22e18 374 posSFvals = [] 375 ratios = [] 376 while not test: 377 maxerr = numpy.max(numpy.abs(numpy.subtract(xx, numpy.round(xx, sf)))) 378 minerr = numpy.min(numpy.abs(numpy.subtract(xx, numpy.round(xx, sf)))) 379 if maxerr > 0: 380 rat = 10 ** (-sf) / maxerr 381 else: 382 rat = 10. 383 if ((10 ** (-sf)) < absMin): 384 posSFvals.append((sf, rat)) 385 ratios.append(rat) 386 test = True 387 elif (maxerr < 10 ** (-(sf + 1)) and minerr < 1.0E-323): 388 posSFvals.append((sf, rat)) 389 ratios.append(rat) 390 sf += 1 391 best = max(ratios) 392 for tsf, rat in posSFvals: 393 if best == rat: 394 sf = tsf 395 #else: 396 # print "Not integer or floating point type" 397 # return -1 398 399 vals = list(numpy.unique(numpy.round(xx, sf))) 400 valsCopy = vals 401 endTerm = valsCopy.pop(len(vals) - 1) 402 valsCopy.insert(0, endTerm) 403 deltas = numpy.unique(numpy.abs(numpy.subtract(valsCopy, vals))) 404 if deltas[0] > 0.: 405 minVal = deltas[0] 406 elif len(deltas) > 1: 407 minVal = deltas[1] 408 else: 409 minVal = 10 ** (-sf) 410 return minVal #/math.sqrt(float(len(xx)-1))

411 412 #------------------------------------------------------------------------------ 413

414 -def listCounts(inputList, minCount=None):

415 """ 416 This returns the sorted unique values in a list of objects and the number 417 of times this value is listed. Also selects objects with a minimum count. 418 419 @param inputList: The input list of values (each value can be 420 any python object). 421 @type inputList: list 422 @param minCount: The minimum number of occurances of a value for it to 423 be returned 424 @type minCount: int 425 426 @return: List of unique values and the counts for these values. 427 @rtype: list 428 429 """ 430 uniqueValues = sorted(set(inputList)) 431 intgArray = numpy.array([uniqueValues.index(item) for item in inputList]) 432 counts = numpy.bincount(intgArray) 433 if not minCount: 434 return zip(uniqueValues, counts) 435 else: 436 return [(value, count) for value, count in zip(uniqueValues, counts) 437 if count >= minCount]

438 439 #------------------------------------------------------------------------------ 440

441 -def MAD(xx, minSd=None):

442 """Median Absolute Deviation 443 """ 444 med = numpy.median(xx, 0) 445 absdev = numpy.abs(numpy.subtract(xx, med)) 446 mad = numpy.median(absdev, 0) 447 if not minSd: 448 minSd = getMinSd(xx) 449 mad = numpy.maximum(mad, numpy.multiply(numpy.ones(mad.shape, numpy.float32), 450 (minSd / 1.48))) 451 return med, mad

452 453 #------------------------------------------------------------------------------ 454

455 -def maskArray(xx, minV, maxV):

456 xxN = numpy.subtract(xx, (minV - 1.)) 457 xx_mask = numpy.sort(numpy.multiply(xxN, numpy.logical_and(numpy.greater(xx, minV), numpy.less(xx, maxV)))) 458 xxN = [] 459 nmin = xx.size - int(numpy.sum(numpy.sum(numpy.logical_and(numpy.greater(xx, minV), numpy.less(xx, maxV)).astype(numpy.float32)))) 460 nmax = xx.size 461 xx_mask = numpy.add(xx_mask[nmin:nmax], (minV - 1.)) 462 return xx_mask

463 464 #------------------------------------------------------------------------------ 465

466 -def polynomialModel(param, magArray):

467 """Uses numpy to calculate polynomial model 468 """ 469 outArray = numpy.zeros([magArray.size], numpy.float64) 470 for index, par in enumerate(param): 471 outArray = numpy.add(outArray, numpy.multiply(numpy.power(magArray, 472 index), par)) 473 return outArray

474 475 #------------------------------------------------------------------------------ 476

477 -def probChisq(chiSqArray, ndofArray, nFinalPoints=10000, maxValueY=5.):

478 """Calculates the probability that the REDUCED chi-squared of a model 479 is less than a particular value. This is calculated by integrating the 480 area under the chi-squared curve up to this value. Inputs an array of 481 REDUCED chi-squared values and an equivalent array of numbers of 482 degrees-of-freedom for these values. 483 484 @param chiSqArray: array of reduced chisq values 485 @type chiSqArray: numpy array 486 @param ndofArray: array of number of degree of freedom values 487 @type ndofArray: numpy array 488 @param nFinalPoints: number of points in lookup table for integration 489 @type nFinalPoints: int 490 @param maxValueY: maximum value to integrate to. The p(ChiSq)~1 here. 491 @type maxValueY: float 492 @return: array of probabilities 493 @rtype: numpy array 494 """ 495 496 #@@TODO: Large values of ndof - get infinities. Can approx by gaussian. 497 # Need to work out when this is necessary. 498 # first calculate lookup tables in memory 499 # @FIXME: small values of ndof - rounding errors 500 nSteps = 2 * (nFinalPoints + 1) 501 uniqueNdof = numpy.unique(ndofArray) 502 dy = maxValueY / nSteps 503 yVector = numpy.multiply(numpy.arange(1, nSteps + 1), dy) 504 yArray = numpy.multiply(numpy.transpose(numpy.multiply( 505 numpy.ones([uniqueNdof.size, nSteps], numpy.float64), yVector)), 506 uniqueNdof) 507 uniqueNdofMsk = numpy.where(numpy.greater(uniqueNdof, 0.), uniqueNdof, 1) 508 unNdofd2 = numpy.multiply(uniqueNdofMsk, 0.5) 509 constantTermArray = numpy.where(numpy.greater(uniqueNdof, 0.), 510 numpy.multiply(numpy.power(2, unNdofd2), numpy.exp(gammln(unNdofd2))), 511 0.) 512 diffArray = numpy.where(numpy.greater(uniqueNdof, 0.), 513 numpy.multiply(numpy.divide(numpy.multiply(numpy.power( 514 yArray, numpy.multiply(numpy.ones([nSteps, unNdofd2.size], 515 numpy.float64), numpy.subtract(unNdofd2, 1.))), numpy.exp( 516 numpy.multiply(yArray, -0.5))), constantTermArray), uniqueNdofMsk), 0.) 517 areaVector = numpy.zeros([uniqueNdof.size], numpy.float64) 518 # Use simpsons rule. 519 probArray = numpy.zeros([(nFinalPoints), uniqueNdof.size], numpy.float64) 520 pointDiff = 2 * dy 521 finalYvector = numpy.zeros([nFinalPoints], numpy.float64) 522 for point in range(nFinalPoints): 523 simpsonRuleSum = numpy.add(numpy.add(diffArray[2 * point], 524 diffArray[2 * point + 2]), numpy.multiply(diffArray[2 * point + 1], 4.)) 525 areaVector = numpy.add(areaVector, numpy.multiply(simpsonRuleSum, (pointDiff / 6.))) 526 finalYvector[point] = yVector[2 * point + 1] 527 probArray[point] = areaVector 528 529 # now find best indices to look up probability. interpolate between points 530 #finalYvector=numpy.multiply(numpy.arange(1,nFinalPoints+1),pointDiff) 531 minYV = min(finalYvector) 532 index = numpy.where(numpy.greater(pointDiff, 0.), numpy.divide(numpy.add(chiSqArray, minYV), pointDiff), 0) 533 index = numpy.where(numpy.less_equal(index, (finalYvector.size - 1)), index, (finalYvector.size - 1)) 534 upperIndex = numpy.minimum(numpy.ceil(index).astype(numpy.int32), (finalYvector.size - 1)) 535 lowerIndex = numpy.subtract(upperIndex, 1) 536 # indexArray for ndof ndofArray -- uniqueNdof 537 ndofIndexArray = numpy.zeros([ndofArray.size], numpy.int32) 538 for ii, ndof in enumerate(list(uniqueNdof)): 539 if ndof > 0: 540 ndofIndexArray = numpy.add(ndofIndexArray, numpy.multiply(numpy.equal( 541 ndofArray, ndof).astype(numpy.int32), ii)) 542 prob = numpy.nan_to_num(numpy.add(numpy.multiply(numpy.subtract(1, 543 numpy.subtract(index, lowerIndex)), probArray[lowerIndex, ndofIndexArray]), 544 numpy.multiply(numpy.subtract(1, numpy.subtract(upperIndex, index)), 545 probArray[upperIndex, ndofIndexArray]))) 546 prob = numpy.maximum(numpy.minimum(prob, 1.), 0.) 547 return numpy.where(numpy.greater(ndofArray, 0), prob, dbc.realDefault())

548 #------------------------------------------------------------------------------ 549

550 -def probGauss(sigmaArray, nFinalPoints=10000, maxValueY=5.):

551 """Calculates the probability that the abs(sigma) of a standard normal 552 distribution 553 is less than a particular value. This is calculated by integrating the 554 area under the gaussian [N(0,1)] curve up to this value. Inputs an array 555 of standard absolute sigma. 556 557 @param sigmaArray: array of abs(sigma) from a N(0,1) gaussian 558 @type sigmaArray: numpy array 559 @param nFinalPoints: number of points in lookup table for integration 560 @type nFinalPoints: int 561 @param maxValueY: maximum value to integrate to. The p(ChiSq)~1 here. 562 @type maxValueY: float 563 @return: array of probabilities 564 @rtype: numpy array 565 """ 566 567 # first calculate lookup table in memory 568 569 nSteps = 2 * (nFinalPoints + 1) 570 dy = maxValueY / nSteps 571 yVector = numpy.multiply(numpy.arange(1, nSteps + 1), dy) 572 constantTerm = 2. / math.sqrt(2 * math.pi) 573 diffArray = numpy.multiply(numpy.exp(numpy.multiply(numpy.multiply(yVector, yVector), -0.5)), constantTerm) 574 areaVector = numpy.zeros([1], numpy.float64) 575 # Use simpsons rule. 576 probArray = numpy.zeros([nFinalPoints], numpy.float64) 577 pointDiff = 2 * dy 578 finalYvector = numpy.zeros([nFinalPoints], numpy.float64) 579 for point in range(nFinalPoints): 580 simpsonRuleSum = numpy.add(numpy.add(diffArray[2 * point], diffArray[2 * point + 2]), numpy.multiply(diffArray[2 * point + 1], 4.)) 581 areaVector = numpy.add(areaVector, numpy.multiply(simpsonRuleSum, (pointDiff / 6.))) 582 finalYvector[point] = yVector[2 * point + 1] 583 probArray[point] = areaVector 584 # now find best indices to look up probability. interpolate between points 585 586 minYV = min(finalYvector) 587 # @TODO: add in numpy.where 588 index = numpy.divide(numpy.add(sigmaArray, minYV), pointDiff) 589 upperIndex = numpy.minimum(numpy.ceil(index).astype(numpy.int32), (finalYvector.size - 1)) 590 lowerIndex = numpy.subtract(upperIndex, 1) 591 return numpy.add(numpy.multiply(numpy.subtract(1, numpy.subtract(index, lowerIndex)), probArray[lowerIndex]), numpy.multiply(numpy.subtract(1, numpy.subtract(upperIndex, index)), probArray[upperIndex]))

592 593 #------------------------------------------------------------------------------ 594

595 -def Pearson(X, Y):

596 """Calculates the Pearson Correlation Coefficient.""" 597 X = numpy.array(X) 598 Y = numpy.array(Y) 599 N = float(len(X)) 600 sx = numpy.sum(X) 601 sy = numpy.sum(Y) 602 sx2 = numpy.sum(X ** 2) 603 sy2 = numpy.sum(Y ** 2) 604 sxy = numpy.sum(X * Y) 605 top = sxy - (sx * sy / N) 606 bot1 = sx2 - (sx * sx / N) 607 bot2 = sy2 - (sy * sy / N) 608 return top / math.sqrt(bot1 * bot2)

609 610 #------------------------------------------------------------------------------ 611

612 -def scaleError(diffXY, dataS, diffXYErr=[1.], rangeMin=None, rangeMax=None):

613 """This function divides the data into several bins and finds 614 whether there is a gradient in the data. 615 """ 616 if rangeMin is None: 617 rangeMin = min(dataS) 618 619 if rangeMax is None: 620 rangeMax = max(dataS) 621 622 totalData = len(dataS) 623 if not totalData: 624 return dbc.realDefault(), dbc.realDefault(), dbc.realDefault() 625 626 data = [] 627 if len(diffXYErr) == 1 and len(dataS) > 1: 628 for ii in range(totalData): 629 if dataS[ii] >= rangeMin and dataS[ii] <= rangeMax: 630 data.append((dataS[ii], diffXY[ii])) 631 elif len(diffXYErr) == len(dataS): 632 for ii in range(totalData): 633 if dataS[ii] >= rangeMin and dataS[ii] <= rangeMax: 634 data.append((dataS[ii], diffXY[ii], diffXYErr[ii])) 635 data.sort() 636 totalData = len(data) 637 if totalData < 10: 638 return dbc.realDefault(), dbc.realDefault(), dbc.realDefault() # Not enough data 639 elif totalData < 500: 640 noBin = 5 641 noPerBin = int(totalData / 5.) 642 elif totalData < 5000: 643 noBin = 10 644 noPerBin = int(totalData / 10.) 645 else: 646 noBin = int(totalData / 500.) 647 noPerBin = int(totalData / noBin) 648 649 lineData = [] 650 if totalData >= 100: 651 for bb in range(noBin): 652 XYdata = [] 653 Sdata = [] 654 for gal in range(noPerBin): 655 gg = bb * noPerBin + gal 656 Sdata.append(data[gg][0]) 657 XYdata.append(data[gg][1]) 658 medS, _psdS = clippedMADHistogram(Sdata) 659 medXY, psdXY = clippedMADHistogram(XYdata) 660 lineData.append((medS, medXY, psdXY)) 661 else: 662 if len(diffXYErr) == 1: 663 for gal in range(totalData): 664 lineData.append((float(data[gal][0]), float(data[gal][1]), 1.)) 665 else: 666 for gal in range(totalData): 667 lineData.append((float(data[gal][0]), float(data[gal][1]), 668 float(data[gal][2]))) 669 670 expPar = [0., 0.] 671 bestFit = LSQ.polynomialLeastSquaresFit(expPar, lineData) 672 673 # find SD with scale error removed. 674 expData = numpy.add(numpy.multiply(bestFit[0][1], dataS), bestFit[0][0]) 675 diffXYexp = numpy.subtract(diffXY, expData) 676 _finalMed, finalPSTD = clippedMADHistogram(diffXYexp) 677 678 return bestFit[0][0], bestFit[0][1], finalPSTD

679 680 #------------------------------------------------------------------------------ 681

682 -def weightedMean(xx, sig):

683 """ ? 684 """ 685 if xx.size != sig.size: 686 print("Inconsistent shapes in numpy arrays for weightedMean. x array is" 687 " different to sigma array") 688 689 if xx.size < 1 or xx.size != sig.size: 690 return dbc.realDefault(), dbc.realDefault() 691 692 weight = numpy.where(numpy.greater(sig, 0.), (1 / sig) ** 2, 0.) 693 sumweight = numpy.sum(weight) 694 if not sumweight: 695 return dbc.realDefault(), dbc.realDefault() 696 697 sumweightdata = numpy.sum(xx * weight) 698 weightmean = sumweightdata / sumweight 699 if xx.size < 2: 700 return weightmean, dbc.realDefault() 701 702 sumweightdatammeansq = numpy.sum((xx - weightmean) ** 2 * weight) 703 # @FIXME: divide by zero? 704 # @FIXME: generally incorrect? 705 weightsamplevar = sumweight / (sumweight ** 2 - numpy.sum(weight ** 2)) 706 weightsamplevar *= sumweightdatammeansq 707 weightsamplesd = (math.sqrt(weightsamplevar) if weightsamplevar > 0 else 708 dbc.realDefault()) 709 710 return weightmean, weightsamplesd

711 712 #------------------------------------------------------------------------------ 713

714 -def clippedWeightedMean(xx, sig, clip, Niter=5):

715 xx = numpy.array(xx) 716 sig = numpy.array(sig) 717 if xx.size > 1: 718 mean, sd = weightedMean(xx, sig) 719 elif xx.size == 1: 720 mean, sd = xx[0], sig[0] 721 else: 722 mean, sd = dbc.realDefault(), dbc.realDefault() 723 if clip == 0: 724 return mean, sd 725 else: 726 meanold = mean 727 sdold = sd 728 test = 0 729 it = 0 730 while (test == 0) and it < Niter: 731 minV = mean - clip * sd 732 maxV = mean + clip * sd 733 # Make sure all values are >0 apart from those outside range = 0. 734 # then remove all 735 xx_mask = [] 736 sig_mask = [] 737 for ii in range(xx.size): 738 if xx[ii] >= minV and xx[ii] < maxV: 739 xx_mask.append(xx[ii]) 740 sig_mask.append(sig[ii]) 741 xx_mask = numpy.array(xx_mask) 742 sig_mask = numpy.array(sig_mask) 743 if xx_mask.size > 1: 744 mean, sd = weightedMean(xx_mask, sig_mask) 745 elif xx_mask.size == 1: 746 mean, sd = xx_mask[0], sig_mask[0] 747 else: 748 mean, sd = dbc.realDefault(), dbc.realDefault() 749 if sd < 0: 750 test = 1 751 if mean == meanold and sd == sdold: 752 test = 1 753 it += 1 754 return mean, sd

755 756 #------------------------------------------------------------------------------ 757

758 -def welchStetson(aMag, aSigma, bMag, bSigma):

759 """ 760 Calculates the Welch-Stetson statistic (1993, AJ, 105, 1813). The amount of 761 correlation in the variability between two bandpasses. 762 763 @param aMag: array of ? 764 @type aMag: numpy.array 765 @param aSigma: ? 766 @type aSigma: float 767 @param bMag: array of ? 768 @type bMag: numpy.array 769 @param bSigma: ? 770 @type bSigma: float 771 772 @return: Welch-Stetson statistic. 773 @rtype: float 774 775 """ 776 if aMag.size != bMag.size: 777 return dbc.realDefault() 778 const = math.sqrt(aMag.size * float(aMag.size - 1.)) 779 aWeightMean, _aWeightSigma = weightedMean(aMag, aSigma) 780 bWeightMean, _bWeightSigma = weightedMean(bMag, bSigma) 781 #@TODO: add in numpy.where 782 delaArray = numpy.divide(numpy.subtract(aMag, aWeightMean), aSigma) 783 delbArray = numpy.divide(numpy.subtract(bMag, bWeightMean), bSigma) 784 return numpy.nan_to_num(numpy.divide(numpy.sum(numpy.multiply(delaArray, delbArray)), const))

785 786 #------------------------------------------------------------------------------ 787 # Change log: 788 # 789 # 2-Nov-2005, NJC: Commenced file 790 # 2-Nov-2005, NJC: Adding scaleError function 791 # 3-Nov-2005, NJC: Finished scaleError function 792 # 24-Nov-2005, NJC: Edited scaleError for case of no data, 793 # imported Utilities and LeastSquares as LSQ 794 # 25-Nov-2005, NJC: Removed print statements 795 # 25-Nov-2005, NJC: Minor correction 796 # 5-Mar-2006, NJC: Revised clippedHistogram to make it faster on large 797 # arrays. 798 # 6-Mar-2006, NJC: Revised clippedMADHistogram. Added function to produce 799 # masked array 800 # 10-Mar-2006, NJC: Fixed memory leaks, made clippedHistogram and 801 # clippedMADHistogram faster and more reliable 802 # 4-Apr-2006, NJC: Fixed bugs in scaleError and added more option. 803 # 26-May-2006, NJC: Revised clippedHistogram and clippedMADHistogram to 804 # have an optional no of iterations. 805 # 7-Jun-2006, RSC: Replaced utils.sortList() with fast, new Python 2.4 method 806 # 21-Aug-2006, NJC: Fixed bug to remove rounding error. 807 # 17-Apr-2007, NJC: Fixed default values. 808 # 23-Apr-2007, NJC: Set Niter default values to 10. Set clip in clippedMAD 809 # to 1.48times input to make it like sd. 810 # 07-Jun-2007, NJC: Uses numpy and numpy. Works, but need to update 811 # all code to use numpy only eventually. Problem with 812 # scientific. 813 # 13-Jun-2007, NJC: Moved towards numpy only - still need to do a lot of 814 # checks though. 815 # 13-Jun-2007, NJC: Replaced median with numpy.median 816 # 15-Aug-2007, NJC: Fixed clippedHistogram - it tries to convert 817 # a numpy object to a numpy object. 818 # 24-Aug-2007, NJC: Using robust statistics in scaleError (MAD not SD) 819 # 30-Aug-2007, NJC: Fully replaced numarray with numpy. Corrected 820 # clippedMADHistogram to be fully median absolute deviation. 821 # This now has options - mean of the final distribution 822 # and a pseudo atandard deviation - 1.48 times mean. 823 # Default output is median and pseudo standard deviation. 824 # More options to follow. 825 # 6-Sep-2007, NJC: Added getMinSd function. This calculates the minimum value 826 # that standard deviation should have based on the accuracy 827 # of the data or the intervals between points. This 828 # is used to stop the MAD being evaluated as zero. 829 # 18-Sep-2007, NJC: Fixed bug in getMinSd. Added centralMoment function and 830 # now clippedMADHistogram gives optional output "tot". 831 # 22-Sep-2007, NJC: Added chiSq function 832 # 05-Oct-2007, NJC: Fixed bug in getMinSd 833 # 05-Nov-2007, NJC: Fixed bug in getMinSd 834 # 08-Nov-2007, NJC: Edited clippedMADHistogram to calculate getMinSd at 835 # the start. Moved functions so that they are in 836 # alphabetical order. Documented some functions. 837 # Added new functions gammln, probGauss,probChisq. 838 # 09-Nov-2007, NJC: Fixed integer related bug in getMinSd 839 # 15-Nov-2007, NJC: Converted weightedMean calculation to one which uses 840 # numpy. Use two modules, clippedWeightedMean and 841 # weighted mean to avoid repeats. Fixed error in weightMean 842 # Added in better error handling. 843 # 23-Nov-2007, NJC: Fixed weightedMean to avoid rounding error. 844 # 29-Nov-2007, NJC: Made corrections to clippedMADHistogram and getMinSB to 845 # correctly handle zero length arrays. 846 # 14-Feb-2008, NJC: Added listCounts() function that makes use of fast 847 # numpy.bincount() to evaluate the number of times each 848 # value occurs in a list. Value can be anything: integer, 849 # tuple,list,etc. 850 # 18-Feb-2008, NJC: Changed Statistics.clippedMADHistogram so that it can 851 # take 2-D arrays in some modes. Added calculation of skew. 852 # 19-Feb-2008, NJC: Edited calcMAD,calcMean,calcSd,calcSkew so that they 853 # use a minimum value of the first axis rather than minimum 854 # array size. Also set minimum for skew to be >2 rather 855 # than >1. 856 # 21-Feb-2008, NJC: Fixed bug in xx.shape calls for 2-D arrays. 857 # NJC: Fixed order of med,mad in calcMAD. Improved indexing in 858 # probChisq 859 # 6-Jul-2008, NJC: Made sure that return values works with upper and lower 860 # cases. Also returns min and max. 861 # 28-Jul-2008, NJC: Added in welchStetson statistic function. 862

Source Code for Module wsatools.Statistics