# House-price regression walkthrough ----
#
# Loads a house-price data set, removes suspect and incomplete rows, fits
# two linear models for `price` (all predictors, and square footage alone),
# inspects the residuals, and produces prediction intervals for two example
# homes.

# Set up file name ----
# Tip: look at the raw data in a text editor first.
source_data <- "house_prices_in_19083.txt"

# read.csv2() defaults to ';' as the field separator and ',' as the decimal
# mark; `dec` is overridden here so that '.' is the decimal mark.
homes <- read.csv2(source_data, stringsAsFactors = FALSE, dec = ".")

# Review the structure of the homes data set.
str(homes)

# Look for weird values / distributions.
summary(homes)

# The data can be edited by hand if needed. fix() opens an editor, so it is
# only attempted in an interactive session; this keeps the script runnable
# non-interactively (e.g. via Rscript).
if (interactive()) {
  fix(homes)
}

# Clean the data ----
# Better to make corrections in code so they are reproducible: drop rows
# whose sqft is exactly 1 (presumably a placeholder value -- confirm against
# the raw file). Rows where sqft is NA become all-NA rows in this step and
# are removed by na.omit() below.
homes <- homes[!(homes$sqft == 1), ]

# Clean out rows with missing data.
homes <- na.omit(homes)

# Build and assess the full model ----
# `price ~ .` regresses price on every other column of `homes`.
lm.allpredictors <- lm(price ~ ., data = homes)
summary(lm.allpredictors)

lm.resid <- resid(lm.allpredictors)
lm.resid

# Are the residuals distributed as expected?
plot(lm.resid)
plot(density(lm.resid))

# Build a model based on square feet only ----
# Plot price vs sqft, then overlay the fitted line on that plot.
plot(homes$sqft, homes$price)
lm.sqft <- lm(price ~ sqft, data = homes)
summary(lm.sqft)
abline(lm.sqft)

# Which point is which? identify() waits for mouse clicks on the current
# plot, so it is only attempted in an interactive session.
if (interactive()) {
  identify(homes$sqft, homes$price, paste(homes$price, " ", homes$sqft))
}

# Predict how much a home will cost ----
# NOTE(review): because the model was fit with `price ~ .`, these calls
# assume beds, baths, sqft and lotsz are the only predictor columns in
# `homes` -- confirm against the data file.
predict(
  lm.allpredictors,
  data.frame(beds = 4, baths = 3, sqft = 3694, lotsz = 16553),
  interval = "prediction"
)
predict(
  lm.allpredictors,
  data.frame(beds = 3, baths = 1.5, sqft = 1499, lotsz = 5271),
  interval = "prediction"
)