# Basic workflow:
# - check the distribution of each variable against the model assumptions
# - check for outliers and whether any transformation (e.g. log) is needed
# - fit a linear regression model
# - analyze the model fit (R-squared)
# - predict (back-transform the result if a transformation was applied)
# useful packages
import pandas as pd
import json
import statistics
import urllib
import numpy as np
import scipy
import matplotlib.pyplot as plt
# Fit an ordinary-least-squares model on two features and report R-squared,
# then predict the outcome for a raw feature value of 10.
#
# Assumes the feature columns are already log-transformed; the np.exp applied
# to the prediction below implies the outcome is log-transformed as well.
# NOTE(review): `data` and `sm` are not defined in this section. `sm` is
# presumably `statsmodels.api` and `data` a pandas DataFrame -- confirm both
# are provided before this code runs.
feature_cols = ["feature1", "feature2"]

# A *list* of names selects a sub-frame. data["feature1", "feature2"] would
# look up one column keyed by the tuple and raise KeyError.
X = sm.add_constant(data[feature_cols])
y = data["outcome"]
model = sm.OLS(y, X)
result = model.fit()
print("R Squared is:", round(result.rsquared, 3))

# Prediction rows must carry the same columns as the fitted design matrix
# (const + every feature), expressed on the log scale used for fitting.
# NOTE(review): the original supplied a single predictor column, which cannot
# match a two-feature model. Giving every feature the same raw value is an
# assumption -- replace with the real values to predict at.
raw_values = np.arange(8, 11)
Xpred = pd.DataFrame({col: np.log(raw_values) for col in feature_cols})
# has_constant="add" always inserts the intercept column; the default
# ("skip") silently omits it when the rows happen to be constant, e.g. when
# predicting for a single row.
Xpred = sm.add_constant(Xpred, has_constant="add")

# np.exp undoes the log transform of the outcome. np.asarray makes the
# indexing positional regardless of whether predict returns a Series.
predicted = np.exp(np.asarray(result.predict(Xpred)))
# The last row corresponds to the raw value 10.
print("Predicted results for 10 is :",
      round(predicted[-1], 3))