-
Notifications
You must be signed in to change notification settings - Fork 0
/
bdpair_reg.py
95 lines (83 loc) · 4.04 KB
/
bdpair_reg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import os
from Gau_one import Gau_one
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy as sp
class bdpair_reg(object):
def __init__(self, nameof2beads, filename_list, distance_list, E_bead1, E_bead2):
self.nm = nameof2beads
self.E_bd1 = E_bead1
self.E_bd2 = E_bead2
self.dfavg = pd.DataFrame(filename_list, columns = ['GauFile'])
self.dfavg['Distance/A'] = distance_list
# Create an empty dataframe for all data points
self.dframe = pd.DataFrame(columns = ['Distance/A', self.nm, 'deltaE', 'deltaE /(kcal/mol)'])
# Dataframe for averaged energies (small table)
self.dfavg['deltaE /(kcal/mol)'] = np.nan
#
# Make the wholelist
#
def mk2df(self, v_threshold, saveunselected = False):
self.thsd = v_threshold
# Deal with each Gaussian file
indx = 0 # temp index for distance list
for f in self.dfavg['GauFile']:
SG = Gau_one(f)
SG.searchenergies('SCF Done',self.dfavg['Distance/A'][indx], self.nm, self.E_bd1, self.E_bd2)
self.dframe = self.dframe.append(SG.averageif(self.thsd))
self.dfavg['deltaE /(kcal/mol)'][indx] = SG.avgif
indx += 1
#
if saveunselected:
unsel = SG.df.loc[SG.df['deltaE /(kcal/mol)'] > SG.thsd]
if len(unsel):
unsel.to_excel(f+'_unselected.xlsx')
self.dframe['rij'] = self.dframe['Distance/A'] / 7.11 # rc = 7.11A
self.dframe['Xrij'] = 0.5* (1- self.dframe['rij'] )**2
self.dframe['Uij'] = self.dframe['deltaE /(kcal/mol)'] / 0.5924
self.dfavg['rij'] = self.dfavg['Distance/A'] / 7.11
self.dfavg['Xrij'] = 0.5 * (1 - self.dfavg['rij'])**2
self.dfavg['Uij'] = self.dfavg['deltaE /(kcal/mol)'] / 0.5924
###########################################
# The damned statsmodels is hard to use for the whole dataframe.
# There is always the error when calling rsquared or summary().
# It might be a bug...
# But statsmodels works good with the small-sized 'averaged dataframe', which does not have the duplicated x_value.
def sm_regrss_all(self):
return
#smresult = sm.OLS(self.dframe['Uij'], sm.add_constant(b.dframe['Xrij'])).fit()
def sp_regrss_all(self):
# Instead of statsmodels, use scipy.stats instead.
slope, intercept, r_value, p_value, std_error= sp.stats.linregress(self.dframe['Xrij'], self.dframe['Uij'])
return slope, intercept, r_value**2
#
# Here includes the statsmodels linear regression for 'averaged frame'
def sm_regrss_avg(self):
rslt = sm.OLS(self.dfavg['Uij'], sm.add_constant(self.dfavg['Xrij'])).fit()
return rslt.params, rslt.rsquared
def sp_regrss_avg(self):
slope, intercept, r_value, p_value, std_error= sp.stats.linregress(self.dfavg['Xrij'], self.dfavg['Uij'])
return slope, intercept, r_value**2
def plotall(self):
pass
def plotavg(self):
self.slope, self.intercept, self.rsquare = self.sp_regrss_avg()
xx = sp.linspace( self.dfavg['Xrij'].min() - 0.25 * (self.dfavg['Xrij'].mean() - self.dfavg['Xrij'].min()), \
self.dfavg['Xrij'].max() + 0.25 * (self.dfavg['Xrij'].max() - self.dfavg['Xrij'].mean()), \
50)
yy = xx*self.slope + self.intercept
fig, ax = plt.subplots(figsize = (8, 6))
ax.plot(self.dfavg['Xrij'], self.dfavg['Uij'], 'o', label = 'Uij')
ax.plot(xx, yy, 'r--', label = 'fitted' )
ax.legend(loc = 'best')
fig.savefig(self.nm+'_reg_avg.png')
def writexlsx(self, filename):
self.xlsxfn = filename
writer = pd.ExcelWriter(self.xlsxfn)
self.dfavg.to_excel(writer)
self.df_fitted = pd.DataFrame([self.slope , self.intercept, self.rsquare], index = ['slope', 'intercept', 'rsquared_avg'])
self.df_fitted.to_excel(writer, startcol = 10, startrow = 1, header = False)
writer.save()
#