diff options
Diffstat (limited to 'utils')
| -rwxr-xr-x | utils/chi2 | 213 | 
1 files changed, 110 insertions, 103 deletions
@@ -36,13 +36,15 @@ from sddm.stats import *  # Uncertainty on the energy scale  # FIXME: Should get real number from stopping muons -ENERGY_SCALE_MEAN = 1.0 -ENERGY_SCALE_UNCERTAINTY = 0.1 +ENERGY_SCALE_MEAN = {'e': 1.0, 'u': 1.0, 'eu': 1.0} +ENERGY_SCALE_UNCERTAINTY = {'e':0.1, 'u': 0.1, 'eu': 0.1} +ENERGY_RESOLUTION_MEAN = {'e': 0.0, 'u': 0.0, 'eu': 0.0} +ENERGY_RESOLUTION_UNCERTAINTY = {'e':0.1, 'u': 0.1, 'eu': 0.1}  particle_id = {20: 'e', 22: r'\mu'} -def plot_hist2(df, norm=1.0, scale=1.0, color=None): -    for id, df_id in sorted(df.groupby('id')): +def plot_hist2(hists, bins, color=None): +    for id in (20,22,2020,2022,2222):          if id == 20:              plt.subplot(2,3,1)          elif id == 22: @@ -54,8 +56,8 @@ def plot_hist2(df, norm=1.0, scale=1.0, color=None):          elif id == 2222:              plt.subplot(2,3,6) -        bins = np.logspace(np.log10(20),np.log10(10e3),21) -        plt.hist(df_id.ke.values*scale, bins=bins, histtype='step', weights=np.tile(norm,len(df_id.ke.values)),color=color) +        bincenters = (bins[1:] + bins[:-1])/2 +        plt.hist(bincenters, bins=bins, histtype='step', weights=hists[id],color=color)          plt.gca().set_xscale("log")          plt.xlabel("Energy (MeV)")          plt.title('$' + ''.join([particle_id[int(''.join(x))] for x in grouper(str(id),2)]) + '$') @@ -63,69 +65,85 @@ def plot_hist2(df, norm=1.0, scale=1.0, color=None):      if len(df):          plt.tight_layout() -def plot_hist(df, muons=False): -    for id, df_id in sorted(df.groupby('id')): -        if id == 20: -            plt.subplot(3,4,1) -        elif id == 22: -            plt.subplot(3,4,2) -        elif id == 2020: -            plt.subplot(3,4,5) +def get_mc_hists(data,x,bins,apply_norm=False): +    mc_hists = {} + +    # FIXME: May need to increase number of bins here +    bins2 = np.logspace(np.log10(20),np.log10(10e3),1000) +    bincenters2 = (bins2[1:] + bins2[:-1])/2 + +    for id in (20,22,2020,2022,2222): +        ke = data[data.id == id].ke.values + +        if id in (20,2020): +            ke = ke*x[1] +            scale = bincenters2*x[2] +        elif id in (22,2222): +            ke = ke*x[3] +            scale = bincenters2*x[4]          elif id == 2022: -            plt.subplot(3,4,6) -        elif id == 2222: -            plt.subplot(3,4,7) -        elif id == 202020: -            plt.subplot(3,4,9) -        elif id == 202022: -            plt.subplot(3,4,10) -        elif id == 202222: -            plt.subplot(3,4,11) -        elif id == 222222: -            plt.subplot(3,4,12) - -        if muons: -            plt.hist(np.log10(df_id.ke.values/1000), bins=np.linspace(0,4.5,100), histtype='step') -            plt.xlabel("log10(Energy (GeV))") -        else: -            plt.hist(df_id.ke.values, bins=np.linspace(20,10e3,100), histtype='step') -            plt.xlabel("Energy (MeV)") -        plt.title(str(id)) +            ke = ke*x[5] +            scale = bincenters2*x[6] -    if len(df): -        plt.tight_layout() +        hist = np.histogram(ke,bins=bins2)[0] + +        cdf = norm.cdf(bins[:,np.newaxis],bincenters2,scale)*hist +        mc_hists[id] = np.sum(cdf[1:] - cdf[:-1],axis=-1) +        if apply_norm: +            mc_hists[id] *= x[0] +    return mc_hists + +def get_data_hists(data,bins): +    data_hists = {} +    for id in (20,22,2020,2022,2222): +        df_id = data[data.id == id] +        data_hists[id] = np.histogram(df_id.ke.values,bins=bins)[0] +    return data_hists + +# Likelihood Fit Parameters +# 0 - Atmospheric Neutrino Flux Scale +# 1 - Electron energy bias +# 2 - Electron energy resolution +# 3 - Muon energy bias +# 4 - Muon energy resolution +# 5 - Electron + Muon energy bias +# 6 - Electron + Muon energy resolution + +def make_nll(data, mc, bins): +    data_hists = get_data_hists(data,bins) -def make_nll(data, mc_hists):      def nll(x, grad=None): -        if x[0] < 0 or x[1] < 0: +        if any(x[i] < 0 for i in range(len(x))):              return np.inf -        data_hists = {} -        for id in (20,22,2020,2022,2222): -            df_id = data[data.id == id] -            if len(df_id): -                data_hists[id] = np.histogram(df_id.ke.values*x[1],bins=bins)[0] -            else: -                data_hists[id] = np.zeros(len(bins)-1,dtype=np.int) +        mc_hists = get_mc_hists(mc,x,bins,apply_norm=True)          nll = 0          for id in data_hists: -            oi = data_hists[id].sum() +            oi = data_hists[id]              ei = mc_hists[id] + EPSILON -            N = oi.sum() -            n = ei.sum() +            N = ei.sum()              nll -= -N - np.sum(gammaln(oi+1)) + np.sum(oi*np.log(ei)) -        return nll - norm.logpdf(x[1],ENERGY_SCALE_MEAN,ENERGY_SCALE_UNCERTAINTY) +        nll -= norm.logpdf(x[1],ENERGY_SCALE_MEAN['e'],ENERGY_SCALE_UNCERTAINTY['e']) +        nll -= norm.logpdf(x[3],ENERGY_SCALE_MEAN['u'],ENERGY_SCALE_UNCERTAINTY['u']) +        nll -= norm.logpdf(x[5],ENERGY_SCALE_MEAN['eu'],ENERGY_SCALE_UNCERTAINTY['eu']) +        nll -= norm.logpdf(x[2],ENERGY_RESOLUTION_MEAN['e'],ENERGY_RESOLUTION_UNCERTAINTY['e']) +        nll -= norm.logpdf(x[4],ENERGY_RESOLUTION_MEAN['u'],ENERGY_RESOLUTION_UNCERTAINTY['u']) +        nll -= norm.logpdf(x[6],ENERGY_RESOLUTION_MEAN['eu'],ENERGY_RESOLUTION_UNCERTAINTY['eu']) +        print("nll = %.2f" % nll) +        return nll      return nll -def get_mc_hist(data_mc,data_hist,x,bins): -    hist = np.histogram(data_mc,bins=bins)[0] -    return get_mc_hist_posterior(hist,data_hist,norm=x[0]/100.0) +def get_mc_hists_posterior(data,data_hists,x,bins): +    mc_hists = get_mc_hists(data,x,bins) +    for id in (20,22,2020,2022,2222): +        mc_hists[id] = get_mc_hist_posterior(mc_hists[id],data_hists[id],norm=x[0]) +    return mc_hists  def get_data_hist(data,x,bins): -    return np.histogram(data*x[1],bins=bins)[0] +    return np.histogram(data,bins=bins)[0] -def get_multinomial_prob(data, data_mc, x_samples, bins, percentile=99.0, size=10000): +def get_multinomial_prob(data, data_mc, id, x_samples, bins, percentile=99.0, size=10000):      """      Returns the p-value that the histogram of the data is drawn from the MC      histogram. @@ -145,16 +163,17 @@ def get_multinomial_prob(data, data_mc, x_samples, bins, percentile=99.0, size=1          bins: bins used to bin the mc histogram          size: number of values to compute      """ +    data_hists = get_data_hists(data,bins) +      ps = []      for i in range(size):          x = x_samples[np.random.randint(x_samples.shape[0])] -        data_hist = get_data_hist(data,x,bins) -        mc = get_mc_hist(data_mc,data_hist,x,bins) +        mc = get_mc_hists_posterior(data_mc,data_hists,x,bins)[id]          N = mc.sum()          # Fix a bug in scipy(). See https://github.com/scipy/scipy/issues/8235 (I think).          mc = mc + 1e-10          p = mc/mc.sum() -        chi2_data = nllr(data_hist,mc) +        chi2_data = nllr(data_hists[id],mc)          # To draw the multinomial samples we first draw the expected number of          # events from a Poisson distribution and then loop over the counts and          # unique values. The reason we do this is that you can't call @@ -254,59 +273,28 @@ if __name__ == '__main__':      mc = prompt_mc      bins = np.logspace(np.log10(20),np.log10(10e3),21) -    data_hists = {} -    mc_hists = {} -    for id in (20,22,2020,2022,2222): -        df_id = data[data.id == id] -        if len(df_id): -            data_hists[id] = np.histogram(df_id.ke.values,bins=bins)[0] -        else: -            data_hists[id] = np.zeros(len(bins)-1,dtype=np.int) -    for id in (20,22,2020,2022,2222): -        df_id = mc[mc.id == id] -        if len(df_id): -            mc_hists[id] = np.histogram(df_id.ke.values,bins=bins)[0]/100 -        else: -            mc_hists[id] = np.zeros(len(bins)-1,dtype=np.int) - -    data_atm_hists = {} -    mc_atm_hists = {} -    for id in (20,22,2020,2022,2222): -        df_id = atm[atm.id == id] -        if len(df_id): -            data_atm_hists[id] = np.histogram(df_id.ke.values,bins=bins)[0] -        else: -            data_atm_hists[id] = np.zeros(len(bins)-1,dtype=np.int) +    nll = make_nll(data,mc,bins) -    for id in (20,22,2020,2022,2222): -        df_id = atm_mc[atm_mc.id == id] -        if len(df_id): -            mc_atm_hists[id] = np.histogram(df_id.ke.values,bins=bins)[0]/100 -        else: -            mc_atm_hists[id] = np.zeros(len(bins)-1,dtype=np.int) - -    nll = make_nll(data,mc_hists) - -    x0 = np.array([1.0,1.0]) +    x0 = np.array([1.0,1.0,EPSILON,1.0,EPSILON,1.0,EPSILON])      opt = nlopt.opt(nlopt.LN_SBPLX, len(x0))      opt.set_min_objective(nll) -    low = np.array([1e-10,1e-10]) -    high = np.array([10,10]) +    low = np.array([EPSILON]*len(x0)) +    high = np.array([10]*len(x0))      opt.set_lower_bounds(low)      opt.set_upper_bounds(high)      opt.set_ftol_abs(1e-10) -    opt.set_initial_step([0.01,0.01]) +    opt.set_initial_step([0.01]*len(x0))      xopt = opt.optimize(x0)      print("xopt = ", xopt)      nll_xopt = nll(xopt)      print("nll(xopt) = ", nll(xopt)) -    pos = np.empty((10, len(x0)),dtype=np.double) +    pos = np.empty((20, len(x0)),dtype=np.double)      for i in range(pos.shape[0]):          pos[i] = xopt + np.random.randn(len(x0))*0.1 -        pos[i,:] = np.clip(pos[i,:],1e-10,10) +        pos[i,:] = np.clip(pos[i,:],low,high)      nwalkers, ndim = pos.shape @@ -324,21 +312,36 @@ if __name__ == '__main__':      samples = sampler.chain.reshape((-1,len(x0)))      plt.figure() -    plt.subplot(2,2,1) +    plt.subplot(3,3,1)      plt.hist(samples[:,0],bins=100,histtype='step')      plt.xlabel("Atmospheric Flux Scale") -    plt.subplot(2,2,2) +    plt.subplot(3,3,2) +    plt.hist(samples[:,1],bins=100,histtype='step') +    plt.xlabel("Electron Energy Scale") +    plt.subplot(3,3,3) +    plt.hist(samples[:,1],bins=100,histtype='step') +    plt.xlabel("Electron Energy Resolution") +    plt.subplot(3,3,4) +    plt.hist(samples[:,1],bins=100,histtype='step') +    plt.xlabel("Muon Energy Scale") +    plt.subplot(3,3,5) +    plt.hist(samples[:,1],bins=100,histtype='step') +    plt.xlabel("Muon Energy Resolution") +    plt.subplot(3,3,6) +    plt.hist(samples[:,1],bins=100,histtype='step') +    plt.xlabel("Electron + Muon Energy Scale") +    plt.subplot(3,3,7)      plt.hist(samples[:,1],bins=100,histtype='step') -    plt.xlabel("Energy Scale") +    plt.xlabel("Electron + Muon Energy Resolution")      prob = {}      for id in (20,22,2020,2022,2222): -        prob[id] = get_multinomial_prob(data[data.id == id].ke.values,mc[mc.id == id].ke.values,samples,bins) +        prob[id] = get_multinomial_prob(data,mc,id,samples,bins)          print(id, prob[id])      prob_atm = {}      for id in (20,22,2020,2022,2222): -        prob_atm[id] = get_multinomial_prob(atm[atm.id == id].ke.values,atm_mc[atm_mc.id == id].ke.values,samples,bins) +        prob_atm[id] = get_multinomial_prob(atm,atm_mc,id,samples,bins)          print(id, prob_atm[id])      handles = [Line2D([0], [0], color='C0'), @@ -346,8 +349,10 @@ if __name__ == '__main__':      labels = ('Data','Monte Carlo')      fig = plt.figure() -    plot_hist2(prompt,scale=xopt[1],color='C0') -    plot_hist2(prompt_mc,norm=xopt[0]/100,color='C1') +    hists = get_data_hists(prompt,bins) +    hists_mc = get_mc_hists(prompt_mc,xopt,bins,apply_norm=True) +    plot_hist2(hists,bins=bins,color='C0') +    plot_hist2(hists_mc,bins=bins,color='C1')      for id in (20,22,2020,2022,2222):          if id == 20:              plt.subplot(2,3,1) @@ -369,8 +374,10 @@ if __name__ == '__main__':      else:          plt.suptitle("Without Neutron Follower")      fig = plt.figure() -    plot_hist2(atm,scale=xopt[1],color='C0') -    plot_hist2(atm_mc,norm=xopt[0]/100,color='C1') +    hists = get_data_hists(atm,bins) +    hists_mc = get_mc_hists(atm_mc,xopt,bins,apply_norm=True) +    plot_hist2(hists,bins=bins,color='C0') +    plot_hist2(hists_mc,bins=bins,color='C1')      for id in (20,22,2020,2022,2222):          if id == 20:              plt.subplot(2,3,1)  | 
