;function to detect type of error array input
function errtype, err, bad_err_msg
sz = size(err)
  case sz[0] of
    0: errtype = 'sigma'
    1: errtype = 'sigmas'
    3: errtype = 'pdfs'
    else: message,bad_err_msg
  endcase
return,errtype
end

;function to check for consistent error array input
pro vet_err, err, errtype, n, bad_err_msg
  sz = size(err)
  
  badinput = 0 ;turn this switch on if input is bad
  ;check that dimensions are good
  ;if errtype eq 'sigma' -- no action needed for scalar
  if errtype eq 'sigmas' and sz[1] ne n then badinput = 1
  if errtype eq 'pdfs' and (sz[1] ne n or sz[2] ne 2) then badinput = 1
  
  ;print error if bad dimensions
  if badinput then message,bad_err_msg
end

;function to generate simulated data based on values and error array
function generate_data, v, err, type, n, nsim, dbl, seed
  r = type eq 'pdfs' ? randomU(seed, n, nsim, double=dbl) : randomN(seed, n, nsim, double=dbl)
  case type of
    ;v # replicate(1,n) uses matrix multiplication to create an array where the
    ;nth column is filled with v[n]
    'sigma': simdata = r*err + (v # replicate(1,nsim))
    'sigmas': simdata = r*(err # replicate(1,nsim)) + (v # replicate(1,nsim))
    'pdfs': begin
      simdata = dbl ? dblarr(n, nsim) : fltarr(n, nsim)
      for i = 0,n-1 do begin
        pdfx = err[i,0,*]
        pdfy = err[i,1,*]
        
        ;first compute the cdf from the pdf using trapezoidal integration
        trapezoid_areas = 0.5*(pdfy[1:-1] + pdfy[0:-2])*(pdfx[1:-1] - pdfx[0:-2])
        f = TOTAL(trapezoid_areas,/CUMULATIVE)
        f = f/f[-1] ;ensure it is normalized
        
        ;modify x vector have one pt centered at each trapezoidal element
        pdfx = (pdfx[1:-1] + pdfx[0:-2])/2.
        
        ;transform uniform to input distribution via interpolation from the cdf
        simdata[i,*] = INTERPOL(pdfx, f, r[i,*])
      endfor
    end
  endcase
  return,simdata
end

;;;;; THE MAIN FUNCTION ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
function safe_correlate, x, y, xerr, yerr, nsim=nsim, seed=seed
;+
;
; NAME:
;       SAFE_CORRELATE
;
; PURPOSE:
;       This function computes the probability by which the null hypothesis of 
;       uncorrelated data may be rejected while accounting for uncertainty in 
;       the data values.
;
; EXPLANATION:
;       This function generates NSIM simulated X,Y datasets based on the
;       provided points and their erros. These are then used to compute
;       the probability that uncorrelated data could explain the arrangement
;       of the points, the probability-to-exceed or PTE, using Spearman's rank 
;       correlation test. Each simulated dataset is assigned a probability of 
;       1/NSIM of occuring. Thus, for a given dataset, the probability that the 
;       true data (given the uncertainties) are arranged as simulated AND
;       that this particular arrangment of data can be explained without an 
;       underlying correlation is PTE/NSIM. These values are summed to compute 
;       the overall probability that the data represent an uncorrelated 
;       arrangement of points (in other words, the p-value or PTE for the null 
;       hypothesis of uncorrelated data).
;
;       A tutorial on SAFE_CORRELATE is available at 
;       http://parkeloyd.com/output/code/safe_correlate/
;
; CALLING SEQUENCE:
;       Result = SAFE_CORRELATE(X, Y, XERR, YERR, [NSIM=1e4, SEED=SEED])
;
; INPUTS:
;       X,Y:       N-element vectors of the data points. These are ignored if 
;                  PDF input is supplied for X or Y (see below).
;       
;       XERR,YERR: The data point errors. These may be supplied as a scalar, 
;                  N-element vector, 2xM array, or Nx2xM array.
;                  scalar: The identical Gaussian 1-sigma error for all 
;                          points.
;                  N vector: The Gaussian 1-sigma error for each respective 
;                            point.
;                  Nx2xM array: M points sampling the probability distribution 
;                               function (PDF) for each data point. The values 
;                               are contained in [N,0,*] and probability 
;                               densities in [N,1,*]. This is useful for 
;                               non-Gaussian errors, especially upper limits.
;
; KEYWORD PARAMETERS:
;       NSIM: The number of X,Y datasets to simulate. Default = 1e4.
;       SEED: Random number seed for use with RANDOMN and RANDOMU. Useful for 
;             ensuring reproducible results. Can either be an input value or 
;             a variable into which the used value will be stored.
;
; EXAMPLES:
;       Data with identical errors:
;         xerr = 2.0
;         yerr = 3.0
;          
;         ;generate linear data with errors
;         N = 10
;         x = findgen(N) + randomn(seed,N)*xerr
;         y = findgen(N) + randomn(seed,N)*yerr
;          
;         ;plot
;         ep = errorplot(x,y,replicate(xerr,N),replicate(yerr,N),'o')
;          
;         ;corrrelate
;         print,safe_correlate(x,y,xerr,yerr)
;          
;       Data with differing errors, 5e3 simulations:
;         ;generate nonuniform errors
;         N = 10
;         xerr = randomu(seed,N) + 1.0
;         yerr = randomu(seed,N)*1.5 + 1.0
;         
;         ;generate linear data with errors
;         x = findgen(N) + randomn(seed,N)*xerr
;         y = findgen(N) + randomn(seed,N)*yerr
;         
;         ;plot
;         ep = errorplot(x,y,xerr,yerr,'o')
;         
;         ;correlate
;         print,safe_correlate(x,y,xerr,yerr,nsim=5e3)
;         
;       Data with non-gaussian errors
;         ;generate linear data with some scatter
;         N = 10
;         x = findgen(N) + 5 + 2*randomn(seed,N)
;         y = findgen(N) + 5 + 3*randomn(seed,N)
;         
;         ;assign uniform pdfs to the x data and gamma distributions to the
;         ;y data (just for example, since the data were actaully generated
;         ;from a Gaussian PDF)
;         ;note that the PDFs do not have to be normalized
;         M = 1000 ;number of points sampling pdfs
;         xerr = fltarr(N,2,M)
;         yerr = fltarr(N,2,M)
;         t = 0.7 ;gamma distribution scale parameter
;         for i = 0,N-1 do begin &$
;           xvalues = findgen(M)/(M-1) + x[i] - 0.5 &$ ;width = 1.0
;           xprobs = replicate(1.0, M) &$
;           xerr[i,0,*] = xvalues &$
;           xerr[i,1,*] = xprobs &$
;           yvalues = findgen(M)/(M-1)*y[i]*2.0 &$
;           k = y[i]/t + 1 &$
;           yprobs = yvalues^(k-1)*exp(-yvalues/t)/t^k/gamma(k) &$
;           yerr[i,0,*] = yvalues &$
;           yerr[i,1,*] = yprobs &$
;         endfor
;         
;         ;correlate
;         print,safe_correlate(x,y,xerr,yerr)
;
; REFERENCE:
;       See Numerical Recipes by Press et al. for information on the 
;       Spearman Rank correlation test.
;
; MODIFICATION HISTORY:
;       Written by:  R. O. Parke Loyd, 2014-07
;-

;;;;; GROOM AND VET THE INPUT ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

bad_err_msg = 'Bad input shape for xerr or yerr, see code header.'

;determine type of error array supplied (sigma, sigmas, pdfs)
xerrtype = errtype(xerr, bad_err_msg)
yerrtype = errtype(yerr, bad_err_msg)

;check if x and y are going to be used and, if so, make sure they have the same
;length
if xerrtype eq 'pdfs' then begin
  temp = size(xerr)
  n = temp[1]
endif else begin
  if yerrtype eq 'pdfs' then begin
    temp = size(yerr)
    n = temp[1]
  endif else begin
    n = n_elements(x)
    if n ne n_elements(y) then begin
      message, 'The x and y vectors must have the same number of points.'
    endif
  endelse
endelse

;check that error input is good and determine its type
vet_err,xerr,xerrtype,n,bad_err_msg
vet_err,yerr,yerrtype,n,bad_err_msg

;record whether double precision is used
dbl = isa(x,'double') or isa(y,'double')

;set default number of simulations
if ~keyword_set(nsim) then nsim = 1e4

;;;;; GENERATE SIMULATED DATA ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
xsim = generate_data(x, xerr, xerrtype, n, nsim, dbl, seed)
ysim = generate_data(y, yerr, yerrtype, n, nsim, dbl, seed)

;;;;; COMPUTE PROBABILITY TO EXCEED FOR NULL HYPOTHESIS ;;;;;;;;;;;;;;;;;;;;;;; 

pte = 0.0d
for i = 0,nsim-1 do begin
  result = r_correlate(xsim[*,i], ysim[*,i])
  pte += result[1]
endfor
pte = pte/nsim

return,pte

end