갈루아의 반서재

728x90

R version 3.1.2 (2014-10-31) -- "Pumpkin Helmet"

Copyright (C) 2014 The R Foundation for Statistical Computing

Platform: i386-w64-mingw32/i386 (32-bit)

 

R is free software and comes with ABSOLUTELY NO WARRANTY.

You are welcome to redistribute it under certain conditions.

Type 'license()' or 'licence()' for distribution details.

 

R is a collaborative project with many contributors.

Type 'contributors()' for more information and

'citation()' on how to cite R or R packages in publications.

 

Type 'demo()' for some demos, 'help()' for on-line help, or

'help.start()' for an HTML browser interface to help.

Type 'q()' to quit R.

 

[Workspace loaded from ~/.RData]

 

> dspath <- "http://rattle.togaware.com/weather.csv"

> weather <- read.csv(dspath)

> library(rattle) # for the weather dataset and normVarNames()

Rattle: A free graphical interface for data mining with R.

Version 3.3.0 Copyright (c) 2006-2014 Togaware Pty Ltd.

Type 'rattle()' to shake, rattle, and roll your data.

 

Attaching package: ‘rattle’

 

The following object is masked _by_ ‘.GlobalEnv’:

 

    weather

 

> dim(weather)

[1] 366  24

> dsname <- "weather"

> ds <- get(dsname)

> dim(ds)

[1] 366  24

> class(ds)

[1] "data.frame"

> ds <- tbl_df(ds)

Error: could not find function "tbl_df"   # dplyr 패키지 로드 필요 (library(dplyr) 실행)

>  library(dplyr)

 

Attaching package: ‘dplyr’

 

The following object is masked from ‘package:stats’:

 

    filter

 

The following objects are masked from ‘package:base’:

 

    intersect, setdiff, setequal, union

 

> ds <- tbl_df(ds)

> class(ds)

[1] "tbl_df"     "tbl"        "data.frame"

> names(ds)

 [1] "Date"          "Location"      "MinTemp"      

 [4] "MaxTemp"       "Rainfall"      "Evaporation"  

 [7] "Sunshine"      "WindGustDir"   "WindGustSpeed"

[10] "WindDir9am"    "WindDir3pm"    "WindSpeed9am" 

[13] "WindSpeed3pm"  "Humidity9am"   "Humidity3pm"  

[16] "Pressure9am"   "Pressure3pm"   "Cloud9am"     

[19] "Cloud3pm"      "Temp9am"       "Temp3pm"      

[22] "RainToday"     "RISK_MM"       "RainTomorrow" 

> names(ds) <- normVarNames(names(ds))

> names(ds)

 [1] "date"            "location"        "min_temp"       

 [4] "max_temp"        "rainfall"        "evaporation"    

 [7] "sunshine"        "wind_gust_dir"   "wind_gust_speed"

[10] "wind_dir_9am"    "wind_dir_3pm"    "wind_speed_9am" 

[13] "wind_speed_3pm"  "humidity_9am"    "humidity_3pm"   

[16] "pressure_9am"    "pressure_3pm"    "cloud_9am"      

[19] "cloud_3pm"       "temp_9am"        "temp_3pm"       

[22] "rain_today"      "risk_mm"         "rain_tomorrow"  

> sapply(ds, class)

           date        location        min_temp        max_temp 

       "factor"        "factor"       "numeric"       "numeric" 

       rainfall     evaporation        sunshine   wind_gust_dir 

      "numeric"       "numeric"       "numeric"        "factor" 

wind_gust_speed    wind_dir_9am    wind_dir_3pm  wind_speed_9am 

      "integer"        "factor"        "factor"       "integer" 

 wind_speed_3pm    humidity_9am    humidity_3pm    pressure_9am 

      "integer"       "integer"       "integer"       "numeric" 

   pressure_3pm       cloud_9am       cloud_3pm        temp_9am 

      "numeric"       "integer"       "integer"       "numeric" 

       temp_3pm      rain_today         risk_mm   rain_tomorrow 

      "numeric"        "factor"       "numeric"        "factor" 

> library(lubridate)

> head(ds$date)

[1] 2007-11-01 2007-11-02 2007-11-03 2007-11-04 2007-11-05

[6] 2007-11-06

366 Levels: 2007-11-01 2007-11-02 2007-11-03 ... 2008-10-31

> ds$date <- ymd(as.character(ds$date))

Error in gsub("+", "*", fixed = T, gsub(">", "_e>", num)) : invalid multibyte string at '<ec><98><a4>?<84>|<ec>삤<ed>썑)(?![[:alpha:]]))|((?<H_s_e>2[0-4]|[01]?\d)\D+(?<M_s_e>[0-5]?\d)\D+((?<OS_s_S_e>[0-5]?\d\.\d+)|(?<S_s_e>[0-6]?\d))))'

> Sys.setlocale("LC_TIME", "usa") 

[1] "English_United States.1252"

> ds$date <- ymd(as.character(ds$date))

> head(ds$date)

[1] "2007-11-01 UTC" "2007-11-02 UTC" "2007-11-03 UTC"

[4] "2007-11-04 UTC" "2007-11-05 UTC" "2007-11-06 UTC"

> sapply(ds,class)

$date

[1] "POSIXct" "POSIXt" 

 

$location

[1] "factor"

 

$min_temp

[1] "numeric"

 

$max_temp

[1] "numeric"

 

$rainfall

[1] "numeric"

 

$evaporation

[1] "numeric"

 

$sunshine

[1] "numeric"

 

$wind_gust_dir

[1] "factor"

 

$wind_gust_speed

[1] "integer"

 

$wind_dir_9am

[1] "factor"

 

$wind_dir_3pm

[1] "factor"

 

$wind_speed_9am

[1] "integer"

 

$wind_speed_3pm

[1] "integer"

 

$humidity_9am

[1] "integer"

 

$humidity_3pm

[1] "integer"

 

$pressure_9am

[1] "numeric"

 

$pressure_3pm

[1] "numeric"

 

$cloud_9am

[1] "integer"

 

$cloud_3pm

[1] "integer"

 

$temp_9am

[1] "numeric"

 

$temp_3pm

[1] "numeric"

 

$rain_today

[1] "factor"

 

$risk_mm

[1] "numeric"

 

$rain_tomorrow

[1] "factor"

 

> (vars <- names(ds))

 [1] "date"            "location"        "min_temp"       

 [4] "max_temp"        "rainfall"        "evaporation"    

 [7] "sunshine"        "wind_gust_dir"   "wind_gust_speed"

[10] "wind_dir_9am"    "wind_dir_3pm"    "wind_speed_9am" 

[13] "wind_speed_3pm"  "humidity_9am"    "humidity_3pm"   

[16] "pressure_9am"    "pressure_3pm"    "cloud_9am"      

[19] "cloud_3pm"       "temp_9am"        "temp_3pm"       

[22] "rain_today"      "risk_mm"         "rain_tomorrow"  

> target <- "rain_tomorrow"

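> risk <- "risk_nm" # NOTE: typo for "risk_mm" -- as recorded, risk_mm is never added to ignore and stays in vars (see the cfs/information.gain output below)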

> id <- c("date","location")

> ignore <- union(id, if (exists("risk")) risk)

> (ids <- which(sapply(ds, function(x) length(unique(x))) == nrow(ds)))

date 

   1 

> ignore <- union(ignore, names(ids))

> mvc <- sapply(ds[vars], function(x) sum(is.na(x)))

> mvn <- names(which(mvc == nrow(ds)))

> ignore <- union(ignore, mvn)

> factors <- which(sapply(ds[vars], is.factor))

> lvls <- sapply(factors, function(x) length(levels(ds[[x]])))

> (many <- names(which(lvls > 20)))

character(0)

> ignore <- union(ignore, many)

> (constants <- names(which(sapply(ds[vars], function(x) all(x == x[1L])))))

[1] "location"

> ignore <- union(ignore, constants)

> mc <- cor(ds[which(sapply(ds, is.numeric))], use="complete.obs")

> mc[upper.tri(mc, diag=TRUE)] <- NA

> mc <-

+     mc %>%

+     abs() %>%

+     data.frame() %>%

+     mutate(var1=row.names(mc)) %>%

+     gather(var2, cor, -var1) %>%

+     na.omit()

Error in function_list[[i]](value) : could not find function "gather"   # tidyr 패키지 로드 필요 (library(tidyr) 실행)

> library(tidyr)

> mc <-

+     mc %>%

+     abs() %>%

+     data.frame() %>%

+     mutate(var1=row.names(mc)) %>%

+     gather(var2, cor, -var1) %>%

+     na.omit()

> mc <- mc[order(-abs(mc$cor)),]

> mc

               var1            var2         cor

33         temp_3pm        max_temp 0.989104911

182    pressure_3pm    pressure_9am 0.966604486

15         temp_9am        min_temp 0.915643957

32         temp_9am        max_temp 0.869727997

254        temp_3pm        temp_9am 0.843836925

78     humidity_3pm        sunshine 0.760942386

2          max_temp        min_temp 0.749570594

16         temp_3pm        min_temp 0.720308771

66         temp_9am     evaporation 0.703483280

93   wind_speed_3pm wind_gust_speed 0.694428277

81        cloud_9am        sunshine 0.688459327

21      evaporation        max_temp 0.686875021

67         temp_3pm     evaporation 0.668503863

82        cloud_3pm        sunshine 0.662973208

4       evaporation        min_temp 0.645910281

169        temp_3pm    humidity_3pm 0.583883827

166       cloud_9am    humidity_3pm 0.550244527

146    humidity_3pm    humidity_9am 0.543593842

27     humidity_3pm        max_temp 0.535072516

218       cloud_3pm       cloud_9am 0.533035334

92   wind_speed_9am wind_gust_speed 0.527679463

96     pressure_9am wind_gust_speed 0.526678523

60     humidity_9am     evaporation 0.520244792

167       cloud_3pm    humidity_3pm 0.519007021

97     pressure_3pm wind_gust_speed 0.513143882

77     humidity_9am        sunshine 0.498336604

11     pressure_9am        min_temp 0.489930431

12     pressure_3pm        min_temp 0.486058595

202        temp_9am    pressure_3pm 0.481429319

110  wind_speed_3pm  wind_speed_9am 0.473694763

84         temp_3pm        sunshine 0.472675244

22         sunshine        max_temp 0.453476787

185        temp_9am    pressure_9am 0.447380426

151        temp_9am    humidity_9am 0.433867903

149       cloud_9am    humidity_9am 0.396590230

61     humidity_3pm     evaporation 0.390128366

63     pressure_3pm     evaporation 0.382414785

85          risk_mm        sunshine 0.382262459

62     pressure_9am     evaporation 0.371158773

29     pressure_3pm        max_temp 0.365461817

170         risk_mm    humidity_3pm 0.361181871

26     humidity_9am        max_temp 0.359175280

152        temp_3pm    humidity_9am 0.355676884

113    pressure_9am  wind_speed_9am 0.354668850

130    pressure_9am  wind_speed_3pm 0.345995510

94     humidity_9am wind_gust_speed 0.342872753

203        temp_3pm    pressure_3pm 0.330822856

45     pressure_9am        rainfall 0.330614686

238         risk_mm       cloud_3pm 0.326167731

204         risk_mm    pressure_3pm 0.323693375

131    pressure_3pm  wind_speed_3pm 0.321248099

56         sunshine     evaporation 0.319733531

187         risk_mm    pressure_9am 0.300552956

44     humidity_3pm        rainfall 0.289866818

150       cloud_3pm    humidity_9am 0.274796226

28     pressure_9am        max_temp 0.274490408

221         risk_mm       cloud_9am 0.273442955

57  wind_gust_speed     evaporation 0.273276328

128    humidity_9am  wind_speed_3pm 0.271286203

111    humidity_9am  wind_speed_9am 0.268865288

102         risk_mm wind_gust_speed 0.254011131

168        temp_9am    humidity_3pm 0.252727502

46     pressure_3pm        rainfall 0.245570340

114    pressure_3pm  wind_speed_9am 0.244179410

186        temp_3pm    pressure_9am 0.236701884

100        temp_9am wind_gust_speed 0.236505052

118        temp_3pm  wind_speed_9am 0.234737436

41   wind_speed_9am        rainfall 0.225298144

17          risk_mm        min_temp 0.216705595

24   wind_speed_9am        max_temp 0.216321537

83         temp_9am        sunshine 0.215557453

220        temp_3pm       cloud_9am 0.207812168

13        cloud_9am        min_temp 0.207525893

135        temp_3pm  wind_speed_3pm 0.205110667

9      humidity_9am        min_temp 0.205012999

6   wind_gust_speed        min_temp 0.198151856

3          rainfall        min_temp 0.192628416

237        temp_3pm       cloud_3pm 0.187145348

25   wind_speed_3pm        max_temp 0.185232456

30        cloud_9am        max_temp 0.181573830

153         risk_mm    humidity_9am 0.174180951

47        cloud_9am        rainfall 0.171006936

255         risk_mm        temp_9am 0.161798361

183       cloud_9am    pressure_9am 0.155863069

39         sunshine        rainfall 0.154580799

43     humidity_9am        rainfall 0.149969217

112    humidity_3pm  wind_speed_9am 0.149609289

31        cloud_3pm        max_temp 0.149130708

201       cloud_3pm    pressure_3pm 0.144191377

184       cloud_3pm    pressure_9am 0.141709426

48        cloud_3pm        rainfall 0.133919974

7    wind_speed_9am        min_temp 0.129732343

200       cloud_9am    pressure_3pm 0.127459183

148    pressure_3pm    humidity_9am 0.125857952

147    pressure_9am    humidity_9am 0.125495143

64        cloud_9am     evaporation 0.111178769

65        cloud_3pm     evaporation 0.110823614

14        cloud_3pm        min_temp 0.110552687

164    pressure_9am    humidity_3pm 0.107773873

115       cloud_9am  wind_speed_9am 0.103277486

50         temp_3pm        rainfall 0.097632603

51          risk_mm        rainfall 0.093072839

40  wind_gust_speed        rainfall 0.090494182

23  wind_gust_speed        max_temp 0.088567284

74  wind_gust_speed        sunshine 0.087149978

8    wind_speed_3pm        min_temp 0.086365931

20         rainfall        max_temp 0.084264456

68          risk_mm     evaporation 0.075792244

58   wind_speed_9am     evaporation 0.069400953

49         temp_9am        rainfall 0.068356449

76   wind_speed_3pm        sunshine 0.065595650

117        temp_9am  wind_speed_9am 0.064897427

75   wind_speed_9am        sunshine 0.064851278

95     humidity_3pm wind_gust_speed 0.057562893

101        temp_3pm wind_gust_speed 0.051323388

99        cloud_3pm wind_gust_speed 0.050336070

42   wind_speed_3pm        rainfall 0.045619506

59   wind_speed_3pm     evaporation 0.040822125

119         risk_mm  wind_speed_9am 0.040703027

10     humidity_3pm        min_temp 0.038885065

134        temp_9am  wind_speed_3pm 0.035252600

132       cloud_9am  wind_speed_3pm 0.035169830

236        temp_9am       cloud_3pm 0.032936656

5          sunshine        min_temp 0.029220454

165    pressure_3pm    humidity_3pm 0.027212497

80     pressure_3pm        sunshine 0.026925620

34          risk_mm        max_temp 0.025597655

116       cloud_3pm  wind_speed_9am 0.021214450

129    humidity_3pm  wind_speed_3pm 0.020601006

38      evaporation        rainfall 0.016800144

79     pressure_9am        sunshine 0.016347688

98        cloud_9am wind_gust_speed 0.014943389

219        temp_9am       cloud_9am 0.012769959

136         risk_mm  wind_speed_3pm 0.009388677

272         risk_mm        temp_3pm 0.007491706

133       cloud_3pm  wind_speed_3pm 0.006195496

> ignore <- union(ignore, c("temp_3pm", "pressure_9am", "temp_9am"))

> length(vars)

[1] 24

> vars <- setdiff(vars,ignore)

> length(vars)

[1] 19

> library(FSelector)

> form <- formula(paste(target,"~ ."))

> cfs(form,ds[vars])

[1] "risk_mm"

> information.gain(form, ds[vars])

                attr_importance

min_temp           3.539250e-02

max_temp           0.000000e+00

rainfall           0.000000e+00

evaporation        0.000000e+00

sunshine           6.523179e-02

wind_gust_dir      4.073802e-02

wind_gust_speed    3.931861e-02

wind_dir_9am       3.537000e-02

wind_dir_3pm       1.759904e-02

wind_speed_9am     9.813415e-05

wind_speed_3pm     0.000000e+00

humidity_9am       2.858310e-02

humidity_3pm       6.189702e-02

pressure_3pm       6.878745e-02

cloud_9am          3.314110e-02

cloud_3pm          6.893149e-02

rain_today         1.261390e-02

risk_mm            4.718903e-01

> dim(ds)

[1] 366  24

> sum(is.na(ds[target]))

[1] 0

> ds <- ds[!is.na(ds[target]),]

> sum(is.na(ds[target]))

[1] 0

> dim(ds)

[1] 366  24

> factors <- which(sapply(ds[vars], is.factor))

> for (f in factors) levels(ds[[f]]) <- normVarNames(levels(ds[[f]]))

> ds[target] <- as.factor(ds[[target]])

> table(ds[target])

 

 No Yes 

300  66 

 

> inputc <- setdiff(vars,target)

> inputc

 [1] "min_temp"        "max_temp"        "rainfall"       

 [4] "evaporation"     "sunshine"        "wind_gust_dir"  

 [7] "wind_gust_speed" "wind_dir_9am"    "wind_dir_3pm"   

[10] "wind_speed_9am"  "wind_speed_3pm"  "humidity_9am"   

[13] "humidity_3pm"    "pressure_3pm"    "cloud_9am"      

[16] "cloud_3pm"       "rain_today"      "risk_mm"        

> inputi <- sapply(inputc, function(x) which (x== names(ds)), USE.NAMES=FALSE)

> inputi

 [1]  3  4  5  6  7  8  9 10 11 12 13 14 15 17 18 19 22 23

> nobs <- nrow(ds)

> nobs

[1] 366

> dim(ds)

[1] 366  24

> dim(ds[vars])

[1] 366  19

> dim(ds[inputc])

[1] 366  18

> dim(ds[inputi])

[1] 366  18

> intersect(inputi, which(sapply(ds, is.numeric)))

 [1]  3  4  5  6  7  9 12 13 14 15 17 18 19 23

> which(sapply(ds, is.numeric))

       min_temp        max_temp        rainfall     evaporation 

              3               4               5               6 

       sunshine wind_gust_speed  wind_speed_9am  wind_speed_3pm 

              7               9              12              13 

   humidity_9am    humidity_3pm    pressure_9am    pressure_3pm 

             14              15              16              17 

      cloud_9am       cloud_3pm        temp_9am        temp_3pm 

             18              19              20              21 

        risk_mm 

             23 

> numi <- intersect(inputi, which(sapply(ds, is.numeric)))

> numi

 [1]  3  4  5  6  7  9 12 13 14 15 17 18 19 23

> names(ds)[numi]

 [1] "min_temp"        "max_temp"        "rainfall"       

 [4] "evaporation"     "sunshine"        "wind_gust_speed"

 [7] "wind_speed_9am"  "wind_speed_3pm"  "humidity_9am"   

[10] "humidity_3pm"    "pressure_3pm"    "cloud_9am"      

[13] "cloud_3pm"       "risk_mm"        

> numc <- names(ds)[numi]

> numc

 [1] "min_temp"        "max_temp"        "rainfall"       

 [4] "evaporation"     "sunshine"        "wind_gust_speed"

 [7] "wind_speed_9am"  "wind_speed_3pm"  "humidity_9am"   

[10] "humidity_3pm"    "pressure_3pm"    "cloud_9am"      

[13] "cloud_3pm"       "risk_mm"        

> which(sapply(ds, is.factor))

     location wind_gust_dir  wind_dir_9am  wind_dir_3pm    rain_today 

            2             8            10            11            22 

rain_tomorrow 

           24 

> intersect(inputi, which(sapply(ds, is.factor)))

[1]  8 10 11 22

> cati <- intersect(inputi, which(sapply(ds, is.factor)))

> cati

[1]  8 10 11 22

> names(ds)[cati]

[1] "wind_gust_dir" "wind_dir_9am"  "wind_dir_3pm"  "rain_today"   

> catc <- names(ds)[cati]

> catc

[1] "wind_gust_dir" "wind_dir_9am"  "wind_dir_3pm"  "rain_today"   

  

> ds <- ds[!is.na(ds[target]),]

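> if(sum(is.na(ds[vars]))) ds[vars] <- na.roughfix(ds[vars]) # na.roughfix() is provided by the randomForest package (library(randomForest) must be loaded first)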

> omit <- NULL

> mo <- attr(na.omit(ds[vars]), "na.action")

> omit <- union(omit,mo)

> if(length(omit)) ds <- ds[-omit,] 

> factors <- which(sapply(ds[vars], is.factor))

> for (f in factors) levels(ds[[f]]) <- normVarNames(levels(ds[[f]])) 

> for (f in factors) levels(ds[[f]]) <- normVarNames(levels(ds[[f]]))

> ds[target] <- as.factor(ds[[target]])

> nobs <- nrow(ds)

> dsdate <- paste0("-", format(Sys.Date(), "%y%m%d"))

> dsrdata <- paste0(dsname, dsdate, ".RData")

> save(ds, dsname, dspath, dsdate, target, risk, id, ignore, vars, nobs, omit, inputi, inputc, numi, numc, cati, catc, file=dsrdata)

 

728x90