diff options
| author | tlatorre <tlatorre@uchicago.edu> | 2020-06-29 07:52:43 -0500 | 
|---|---|---|
| committer | tlatorre <tlatorre@uchicago.edu> | 2020-06-29 07:52:43 -0500 | 
| commit | 21420a805f214203819dd19ffc45fca9451a10bd (patch) | |
| tree | 864b651123c9c0347b763fc501992a50f497dbe6 | |
| parent | 83a96dec3aec3d794dcf9b2d566d0cfb6547a0f5 (diff) | |
| download | sddm-21420a805f214203819dd19ffc45fca9451a10bd.tar.gz sddm-21420a805f214203819dd19ffc45fca9451a10bd.tar.bz2 sddm-21420a805f214203819dd19ffc45fca9451a10bd.zip | |
add energy scale to chi2 fit
| -rwxr-xr-x | utils/chi2 | 132 | 
1 files changed, 102 insertions, 30 deletions
| @@ -33,9 +33,14 @@ from scipy.stats import iqr, norm, beta  from scipy.special import spence  from itertools import izip_longest +# Uncertainty on the energy scale +# FIXME: Should get real number from stopping muons +ENERGY_SCALE_MEAN = 1.0 +ENERGY_SCALE_UNCERTAINTY = 0.1 +  particle_id = {20: 'e', 22: r'\mu'} -def plot_hist2(df, muons=False, norm=1.0, color=None): +def plot_hist2(df, norm=1.0, scale=1.0, color=None):      for id, df_id in sorted(df.groupby('id')):          if id == 20:              plt.subplot(2,3,1) @@ -48,14 +53,10 @@ def plot_hist2(df, muons=False, norm=1.0, color=None):          elif id == 2222:              plt.subplot(2,3,6) -        if muons: -            plt.hist(np.log10(df_id.ke.values/1000), bins=np.linspace(0,4.5,100), histtype='step') -            plt.xlabel("log10(Energy (GeV))") -        else: -            bins = np.logspace(np.log10(20),np.log10(10e3),21) -            plt.hist(df_id.ke.values, bins=bins, histtype='step', weights=np.tile(norm,len(df_id.ke.values)),color=color) -            plt.gca().set_xscale("log") -            plt.xlabel("Energy (MeV)") +        bins = np.logspace(np.log10(20),np.log10(10e3),21) +        plt.hist(df_id.ke.values*scale, bins=bins, histtype='step', weights=np.tile(norm,len(df_id.ke.values)),color=color) +        plt.gca().set_xscale("log") +        plt.xlabel("Energy (MeV)")          plt.title('$' + ''.join([particle_id[int(''.join(x))] for x in grouper(str(id),2)]) + '$')      if len(df): @@ -93,8 +94,19 @@ def plot_hist(df, muons=False):      if len(df):          plt.tight_layout() -def make_nll(data_hists, mc_hists): +def make_nll(data, mc_hists):      def nll(x, grad=None): +        if x[0] < 0 or x[1] < 0: +            return np.inf + +        data_hists = {} +        for id in (20,22,2020,2022,2222): +            df_id = data[data.id == id] +            if len(df_id): +                data_hists[id] = np.histogram(df_id.ke.values*x[1],bins=bins)[0] +            else: +                data_hists[id] = np.zeros(len(bins)-1,dtype=np.int) +          nll = 0          for id in data_hists:              N = data_hists[id].sum() @@ -105,24 +117,53 @@ def make_nll(data_hists, mc_hists):                  p += 1e-10                  p /= p.sum()                  nll -= multinomial.logpmf(data_hists[id],N,p) -        return nll +        return nll - norm.logpdf(x[1],ENERGY_SCALE_MEAN,ENERGY_SCALE_UNCERTAINTY)      return nll  def chi2(samples,expected):      return np.sum((samples-expected)**2/expected,axis=-1) -def get_multinomial_prob(data, mc, size=1000): -    N = mc.sum() -    # Fix a bug in scipy(). See https://github.com/scipy/scipy/issues/8235 (I think). -    mc = mc + 1e-10 -    p = mc/mc.sum() -    chi2_data = chi2(data,mc) +def get_mc_hist(data,x,bins): +    if len(data): +        return np.histogram(data,bins=bins)[0]*x[0]/100.0 +    else: +        return np.zeros(len(bins)-1,dtype=np.int) + +def get_data_hist(data,x,bins): +    if len(data): +        return np.histogram(data*x[1],bins=bins)[0] +    else: +        return np.zeros(len(bins)-1,dtype=np.int) + +def get_multinomial_prob(data, data_mc, x_samples, bins, size=10000): +    """ +    Returns the p-value that the histogram of the data is drawn from the MC +    histogram. + +    Arguments: + +        data: 1D array of KE values +        data_mc: 1D array of MC KE values +        x_samples: MCMC samples of the floated parameters in the fit +        bins: bins used to bin the mc histogram +        size: number of values to compute +    """ +    chi2_data = []      chi2_samples = []      for i in range(size): +        x = x_samples[np.random.randint(x_samples.shape[0])] +        mc = get_mc_hist(data_mc,x,bins) +        N = mc.sum() +        # Fix a bug in scipy(). See https://github.com/scipy/scipy/issues/8235 (I think). +        mc = mc + 1e-10 +        p = mc/mc.sum() +        data_hist = get_data_hist(data,x,bins) +        chi2_data.append(chi2(data_hist,mc))          n = np.random.poisson(N)          samples = multinomial.rvs(n,p)          chi2_samples.append(chi2(samples,mc))      chi2_samples = np.array(chi2_samples) +    chi2_data = np.array(chi2_data)      return np.count_nonzero(chi2_samples >= chi2_data)/len(chi2_samples)  if __name__ == '__main__': @@ -135,12 +176,14 @@ if __name__ == '__main__':      from sddm.plot import *      from sddm import setup_matplotlib      import nlopt +    import emcee      parser = argparse.ArgumentParser("plot fit results")      parser.add_argument("filenames", nargs='+', help="input files")      parser.add_argument("--save", action='store_true', default=False, help="save corner plots for backgrounds")      parser.add_argument("--mc", nargs='+', required=True, help="atmospheric MC files")      parser.add_argument("--nhit-thresh", type=int, default=None, help="nhit threshold to apply to events before processing (should only be used for testing to speed things up)") +    parser.add_argument("--steps", type=int, default=1000, help="number of steps in the MCMC chain")      args = parser.parse_args()      setup_matplotlib(args.save) @@ -210,7 +253,7 @@ if __name__ == '__main__':      for id in (20,22,2020,2022,2222):          df_id = mc[mc.id == id]          if len(df_id): -            mc_hists[id] = np.histogram(df_id.ke.values,bins=bins)[0] +            mc_hists[id] = np.histogram(df_id.ke.values,bins=bins)[0]/100          else:              mc_hists[id] = np.zeros(len(bins)-1,dtype=np.int) @@ -226,34 +269,63 @@ if __name__ == '__main__':      for id in (20,22,2020,2022,2222):          df_id = atm_mc[atm_mc.id == id]          if len(df_id): -            mc_atm_hists[id] = np.histogram(df_id.ke.values,bins=bins)[0] +            mc_atm_hists[id] = np.histogram(df_id.ke.values,bins=bins)[0]/100          else:              mc_atm_hists[id] = np.zeros(len(bins)-1,dtype=np.int) -    nll = make_nll(data_hists,mc_hists) +    nll = make_nll(data,mc_hists) -    x0 = np.array([1.0]) +    x0 = np.array([1.0,1.0])      opt = nlopt.opt(nlopt.LN_SBPLX, len(x0))      opt.set_min_objective(nll) -    low = np.array([1e-10]) -    high = np.array([10]) +    low = np.array([1e-10,1e-10]) +    high = np.array([10,10])      opt.set_lower_bounds(low)      opt.set_upper_bounds(high)      opt.set_ftol_abs(1e-10) -    opt.set_initial_step([0.01]) +    opt.set_initial_step([0.01,0.01])      xopt = opt.optimize(x0) +    print("xopt = ", xopt)      nll_xopt = nll(xopt)      print("nll(xopt) = ", nll(xopt)) +    pos = np.empty((10, len(x0)),dtype=np.double) +    for i in range(pos.shape[0]): +        pos[i] = xopt + np.random.randn(len(x0))*0.1 +        pos[i,:] = np.clip(pos[i,:],1e-10,10) + +    nwalkers, ndim = pos.shape + +    sampler = emcee.EnsembleSampler(nwalkers, ndim, lambda x: -nll(x)) +    with np.errstate(invalid='ignore'): +        sampler.run_mcmc(pos, args.steps) + +    print("Mean acceptance fraction: {0:.3f}".format(np.mean(sampler.acceptance_fraction))) + +    try: +        print("autocorrelation time: ", sampler.get_autocorr_time(quiet=True)) +    except Exception as e: +        print(e) + +    samples = sampler.chain.reshape((-1,len(x0))) + +    plt.figure() +    plt.subplot(2,2,1) +    plt.hist(samples[:,0],bins=100,histtype='step') +    plt.xlabel("Atmospheric Flux Scale") +    plt.subplot(2,2,2) +    plt.hist(samples[:,1],bins=100,histtype='step') +    plt.xlabel("Energy Scale") +      prob = {}      for id in (20,22,2020,2022,2222): -        prob[id] = get_multinomial_prob(data_hists[id],mc_hists[id]*xopt[0]) +        prob[id] = get_multinomial_prob(data[data.id == id].ke.values,mc[mc.id == id].ke.values,samples,bins)          print(id, prob[id])      prob_atm = {}      for id in (20,22,2020,2022,2222): -        prob_atm[id] = get_multinomial_prob(data_atm_hists[id],mc_atm_hists[id]*xopt[0]) +        prob_atm[id] = get_multinomial_prob(atm[atm.id == id].ke.values,atm_mc[atm_mc.id == id].ke.values,samples,bins)          print(id, prob_atm[id])      handles = [Line2D([0], [0], color='C0'), @@ -261,8 +333,8 @@ if __name__ == '__main__':      labels = ('Data','Monte Carlo')      fig = plt.figure() -    plot_hist2(prompt,color='C0') -    plot_hist2(prompt_mc,norm=xopt[0],color='C1') +    plot_hist2(prompt,scale=xopt[1],color='C0') +    plot_hist2(prompt_mc,norm=xopt[0]/100,color='C1')      for id in (20,22,2020,2022,2222):          if id == 20:              plt.subplot(2,3,1) @@ -284,8 +356,8 @@ if __name__ == '__main__':      else:          plt.suptitle("Without Neutron Follower")      fig = plt.figure() -    plot_hist2(atm,color='C0') -    plot_hist2(atm_mc,norm=xopt[0],color='C1') +    plot_hist2(atm,scale=xopt[1],color='C0') +    plot_hist2(atm_mc,norm=xopt[0]/100,color='C1')      for id in (20,22,2020,2022,2222):          if id == 20:              plt.subplot(2,3,1) | 
