R version 3.1.2 (2014-10-31) -- "Pumpkin Helmet"
Copyright (C) 2014 The R Foundation for Statistical Computing
Platform: i386-w64-mingw32/i386 (32-bit)
R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.
R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.
Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.
[Workspace loaded from ~/.RData]
> dspath <- "http://rattle.togaware.com/weather.csv"
> weather <- read.csv(dspath)
> library(rattle) # for the weather dataset and normVarNames()
Rattle: A free graphical interface for data mining with R.
Version 3.3.0 Copyright (c) 2006-2014 Togaware Pty Ltd.
Type 'rattle()' to shake, rattle, and roll your data.
Attaching package: ‘rattle’
The following object is masked _by_ ‘.GlobalEnv’:
weather
> dim(weather)
[1] 366 24
> dsname <- "weather"
> ds <- get(dsname)
> dim(ds)
[1] 366 24
> class(ds)
[1] "data.frame"
> ds <- tbl_df(ds)
Error: could not find function "tbl_df" # dplyr is not loaded yet; load it first
> library(dplyr)
Attaching package: ‘dplyr’
The following object is masked from ‘package:stats’:
filter
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
> ds <- tbl_df(ds)
> class(ds)
[1] "tbl_df" "tbl" "data.frame"
> names(ds)
[1] "Date" "Location" "MinTemp"
[4] "MaxTemp" "Rainfall" "Evaporation"
[7] "Sunshine" "WindGustDir" "WindGustSpeed"
[10] "WindDir9am" "WindDir3pm" "WindSpeed9am"
[13] "WindSpeed3pm" "Humidity9am" "Humidity3pm"
[16] "Pressure9am" "Pressure3pm" "Cloud9am"
[19] "Cloud3pm" "Temp9am" "Temp3pm"
[22] "RainToday" "RISK_MM" "RainTomorrow"
> names(ds) <- normVarNames(names(ds))
> names(ds)
[1] "date" "location" "min_temp"
[4] "max_temp" "rainfall" "evaporation"
[7] "sunshine" "wind_gust_dir" "wind_gust_speed"
[10] "wind_dir_9am" "wind_dir_3pm" "wind_speed_9am"
[13] "wind_speed_3pm" "humidity_9am" "humidity_3pm"
[16] "pressure_9am" "pressure_3pm" "cloud_9am"
[19] "cloud_3pm" "temp_9am" "temp_3pm"
[22] "rain_today" "risk_mm" "rain_tomorrow"
> sapply(ds, class)
date location min_temp max_temp
"factor" "factor" "numeric" "numeric"
rainfall evaporation sunshine wind_gust_dir
"numeric" "numeric" "numeric" "factor"
wind_gust_speed wind_dir_9am wind_dir_3pm wind_speed_9am
"integer" "factor" "factor" "integer"
wind_speed_3pm humidity_9am humidity_3pm pressure_9am
"integer" "integer" "integer" "numeric"
pressure_3pm cloud_9am cloud_3pm temp_9am
"numeric" "integer" "integer" "numeric"
temp_3pm rain_today risk_mm rain_tomorrow
"numeric" "factor" "numeric" "factor"
> library(lubridate)
> head(ds$date)
[1] 2007-11-01 2007-11-02 2007-11-03 2007-11-04 2007-11-05
[6] 2007-11-06
366 Levels: 2007-11-01 2007-11-02 2007-11-03 ... 2008-10-31
> ds$date <- ymd(as.character(ds$date))
Error in gsub("+", "*", fixed = T, gsub(">", "_e>", num)) : invalid multibyte string at '<ec><98><a4>?<84>|<ec>삤<ed>썑)(?![[:alpha:]]))|((?<H_s_e>2[0-4]|[01]?\d)\D+(?<M_s_e>[0-5]?\d)\D+((?<OS_s_S_e>[0-5]?\d\.\d+)|(?<S_s_e>[0-6]?\d))))'
> Sys.setlocale("LC_TIME", "usa")
[1] "English_United States.1252"
> ds$date <- ymd(as.character(ds$date))
> head(ds$date)
[1] "2007-11-01 UTC" "2007-11-02 UTC" "2007-11-03 UTC"
[4] "2007-11-04 UTC" "2007-11-05 UTC" "2007-11-06 UTC"
> sapply(ds,class)
$date
[1] "POSIXct" "POSIXt"
$location
[1] "factor"
$min_temp
[1] "numeric"
$max_temp
[1] "numeric"
$rainfall
[1] "numeric"
$evaporation
[1] "numeric"
$sunshine
[1] "numeric"
$wind_gust_dir
[1] "factor"
$wind_gust_speed
[1] "integer"
$wind_dir_9am
[1] "factor"
$wind_dir_3pm
[1] "factor"
$wind_speed_9am
[1] "integer"
$wind_speed_3pm
[1] "integer"
$humidity_9am
[1] "integer"
$humidity_3pm
[1] "integer"
$pressure_9am
[1] "numeric"
$pressure_3pm
[1] "numeric"
$cloud_9am
[1] "integer"
$cloud_3pm
[1] "integer"
$temp_9am
[1] "numeric"
$temp_3pm
[1] "numeric"
$rain_today
[1] "factor"
$risk_mm
[1] "numeric"
$rain_tomorrow
[1] "factor"
> (vars <- names(ds))
[1] "date" "location" "min_temp"
[4] "max_temp" "rainfall" "evaporation"
[7] "sunshine" "wind_gust_dir" "wind_gust_speed"
[10] "wind_dir_9am" "wind_dir_3pm" "wind_speed_9am"
[13] "wind_speed_3pm" "humidity_9am" "humidity_3pm"
[16] "pressure_9am" "pressure_3pm" "cloud_9am"
[19] "cloud_3pm" "temp_9am" "temp_3pm"
[22] "rain_today" "risk_mm" "rain_tomorrow"
> target <- "rain_tomorrow"
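> risk <- "risk_nm" # NOTE: typo for "risk_mm"; because of it risk_mm is never added to ignore and remains in vars in the output below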
> id <- c("date","location")
> ignore <- union(id, if (exists("risk")) risk)
> (ids <- which(sapply(ds, function(x) length(unique(x))) == nrow(ds)))
date
1
> ignore <- union(ignore, names(ids))
> mvc <- sapply(ds[vars], function(x) sum(is.na(x)))
> mvn <- names(which(mvc == nrow(ds)))
> ignore <- union(ignore, mvn)
> factors <- which(sapply(ds[vars], is.factor))
> lvls <- sapply(factors, function(x) length(levels(ds[[x]])))
> (many <- names(which(lvls > 20)))
character(0)
> ignore <- union(ignore, many)
> (constants <- names(which(sapply(ds[vars], function(x) all(x == x[1L])))))
[1] "location"
> ignore <- union(ignore, constants)
> mc <- cor(ds[which(sapply(ds, is.numeric))], use="complete.obs")
> mc[upper.tri(mc, diag=TRUE)] <- NA
> mc <-
+ mc %>%
+ abs() %>%
+ data.frame() %>%
+ mutate(var1=row.names(mc)) %>%
+ gather(var2, cor, -var1) %>%
+ na.omit()
Error in function_list[[i]](value) : could not find function "gather" # tidyr is not loaded yet; load it first
> library(tidyr)
> mc <-
+ mc %>%
+ abs() %>%
+ data.frame() %>%
+ mutate(var1=row.names(mc)) %>%
+ gather(var2, cor, -var1) %>%
+ na.omit()
> mc <- mc[order(-abs(mc$cor)),]
> mc
var1 var2 cor
33 temp_3pm max_temp 0.989104911
182 pressure_3pm pressure_9am 0.966604486
15 temp_9am min_temp 0.915643957
32 temp_9am max_temp 0.869727997
254 temp_3pm temp_9am 0.843836925
78 humidity_3pm sunshine 0.760942386
2 max_temp min_temp 0.749570594
16 temp_3pm min_temp 0.720308771
66 temp_9am evaporation 0.703483280
93 wind_speed_3pm wind_gust_speed 0.694428277
81 cloud_9am sunshine 0.688459327
21 evaporation max_temp 0.686875021
67 temp_3pm evaporation 0.668503863
82 cloud_3pm sunshine 0.662973208
4 evaporation min_temp 0.645910281
169 temp_3pm humidity_3pm 0.583883827
166 cloud_9am humidity_3pm 0.550244527
146 humidity_3pm humidity_9am 0.543593842
27 humidity_3pm max_temp 0.535072516
218 cloud_3pm cloud_9am 0.533035334
92 wind_speed_9am wind_gust_speed 0.527679463
96 pressure_9am wind_gust_speed 0.526678523
60 humidity_9am evaporation 0.520244792
167 cloud_3pm humidity_3pm 0.519007021
97 pressure_3pm wind_gust_speed 0.513143882
77 humidity_9am sunshine 0.498336604
11 pressure_9am min_temp 0.489930431
12 pressure_3pm min_temp 0.486058595
202 temp_9am pressure_3pm 0.481429319
110 wind_speed_3pm wind_speed_9am 0.473694763
84 temp_3pm sunshine 0.472675244
22 sunshine max_temp 0.453476787
185 temp_9am pressure_9am 0.447380426
151 temp_9am humidity_9am 0.433867903
149 cloud_9am humidity_9am 0.396590230
61 humidity_3pm evaporation 0.390128366
63 pressure_3pm evaporation 0.382414785
85 risk_mm sunshine 0.382262459
62 pressure_9am evaporation 0.371158773
29 pressure_3pm max_temp 0.365461817
170 risk_mm humidity_3pm 0.361181871
26 humidity_9am max_temp 0.359175280
152 temp_3pm humidity_9am 0.355676884
113 pressure_9am wind_speed_9am 0.354668850
130 pressure_9am wind_speed_3pm 0.345995510
94 humidity_9am wind_gust_speed 0.342872753
203 temp_3pm pressure_3pm 0.330822856
45 pressure_9am rainfall 0.330614686
238 risk_mm cloud_3pm 0.326167731
204 risk_mm pressure_3pm 0.323693375
131 pressure_3pm wind_speed_3pm 0.321248099
56 sunshine evaporation 0.319733531
187 risk_mm pressure_9am 0.300552956
44 humidity_3pm rainfall 0.289866818
150 cloud_3pm humidity_9am 0.274796226
28 pressure_9am max_temp 0.274490408
221 risk_mm cloud_9am 0.273442955
57 wind_gust_speed evaporation 0.273276328
128 humidity_9am wind_speed_3pm 0.271286203
111 humidity_9am wind_speed_9am 0.268865288
102 risk_mm wind_gust_speed 0.254011131
168 temp_9am humidity_3pm 0.252727502
46 pressure_3pm rainfall 0.245570340
114 pressure_3pm wind_speed_9am 0.244179410
186 temp_3pm pressure_9am 0.236701884
100 temp_9am wind_gust_speed 0.236505052
118 temp_3pm wind_speed_9am 0.234737436
41 wind_speed_9am rainfall 0.225298144
17 risk_mm min_temp 0.216705595
24 wind_speed_9am max_temp 0.216321537
83 temp_9am sunshine 0.215557453
220 temp_3pm cloud_9am 0.207812168
13 cloud_9am min_temp 0.207525893
135 temp_3pm wind_speed_3pm 0.205110667
9 humidity_9am min_temp 0.205012999
6 wind_gust_speed min_temp 0.198151856
3 rainfall min_temp 0.192628416
237 temp_3pm cloud_3pm 0.187145348
25 wind_speed_3pm max_temp 0.185232456
30 cloud_9am max_temp 0.181573830
153 risk_mm humidity_9am 0.174180951
47 cloud_9am rainfall 0.171006936
255 risk_mm temp_9am 0.161798361
183 cloud_9am pressure_9am 0.155863069
39 sunshine rainfall 0.154580799
43 humidity_9am rainfall 0.149969217
112 humidity_3pm wind_speed_9am 0.149609289
31 cloud_3pm max_temp 0.149130708
201 cloud_3pm pressure_3pm 0.144191377
184 cloud_3pm pressure_9am 0.141709426
48 cloud_3pm rainfall 0.133919974
7 wind_speed_9am min_temp 0.129732343
200 cloud_9am pressure_3pm 0.127459183
148 pressure_3pm humidity_9am 0.125857952
147 pressure_9am humidity_9am 0.125495143
64 cloud_9am evaporation 0.111178769
65 cloud_3pm evaporation 0.110823614
14 cloud_3pm min_temp 0.110552687
164 pressure_9am humidity_3pm 0.107773873
115 cloud_9am wind_speed_9am 0.103277486
50 temp_3pm rainfall 0.097632603
51 risk_mm rainfall 0.093072839
40 wind_gust_speed rainfall 0.090494182
23 wind_gust_speed max_temp 0.088567284
74 wind_gust_speed sunshine 0.087149978
8 wind_speed_3pm min_temp 0.086365931
20 rainfall max_temp 0.084264456
68 risk_mm evaporation 0.075792244
58 wind_speed_9am evaporation 0.069400953
49 temp_9am rainfall 0.068356449
76 wind_speed_3pm sunshine 0.065595650
117 temp_9am wind_speed_9am 0.064897427
75 wind_speed_9am sunshine 0.064851278
95 humidity_3pm wind_gust_speed 0.057562893
101 temp_3pm wind_gust_speed 0.051323388
99 cloud_3pm wind_gust_speed 0.050336070
42 wind_speed_3pm rainfall 0.045619506
59 wind_speed_3pm evaporation 0.040822125
119 risk_mm wind_speed_9am 0.040703027
10 humidity_3pm min_temp 0.038885065
134 temp_9am wind_speed_3pm 0.035252600
132 cloud_9am wind_speed_3pm 0.035169830
236 temp_9am cloud_3pm 0.032936656
5 sunshine min_temp 0.029220454
165 pressure_3pm humidity_3pm 0.027212497
80 pressure_3pm sunshine 0.026925620
34 risk_mm max_temp 0.025597655
116 cloud_3pm wind_speed_9am 0.021214450
129 humidity_3pm wind_speed_3pm 0.020601006
38 evaporation rainfall 0.016800144
79 pressure_9am sunshine 0.016347688
98 cloud_9am wind_gust_speed 0.014943389
219 temp_9am cloud_9am 0.012769959
136 risk_mm wind_speed_3pm 0.009388677
272 risk_mm temp_3pm 0.007491706
133 cloud_3pm wind_speed_3pm 0.006195496
> ignore <- union(ignore, c("temp_3pm", "pressure_9am", "temp_9am"))
> length(vars)
[1] 24
> vars <- setdiff(vars,ignore)
> length(vars)
[1] 19
> library(FSelector)
> form <- formula(paste(target,"~ ."))
> cfs(form,ds[vars])
[1] "risk_mm"
> information.gain(form, ds[vars])
attr_importance
min_temp 3.539250e-02
max_temp 0.000000e+00
rainfall 0.000000e+00
evaporation 0.000000e+00
sunshine 6.523179e-02
wind_gust_dir 4.073802e-02
wind_gust_speed 3.931861e-02
wind_dir_9am 3.537000e-02
wind_dir_3pm 1.759904e-02
wind_speed_9am 9.813415e-05
wind_speed_3pm 0.000000e+00
humidity_9am 2.858310e-02
humidity_3pm 6.189702e-02
pressure_3pm 6.878745e-02
cloud_9am 3.314110e-02
cloud_3pm 6.893149e-02
rain_today 1.261390e-02
risk_mm 4.718903e-01
> dim(ds)
[1] 366 24
> sum(is.na(ds[target]))
[1] 0
> ds <- ds[!is.na(ds[target]),]
> sum(is.na(ds[target]))
[1] 0
> dim(ds)
[1] 366 24
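> factors <- which(sapply(ds[vars], is.factor)) # NOTE(review): these are positions within ds[vars], not within ds, so ds[[f]] on the next line may address different columns; names(which(...)) looks safer -- confirm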
> for (f in factors) levels(ds[[f]]) <- normVarNames(levels(ds[[f]]))
> ds[target] <- as.factor(ds[[target]])
> table(ds[target])
No Yes
300 66
> inputc <- setdiff(vars,target)
> inputc
[1] "min_temp" "max_temp" "rainfall"
[4] "evaporation" "sunshine" "wind_gust_dir"
[7] "wind_gust_speed" "wind_dir_9am" "wind_dir_3pm"
[10] "wind_speed_9am" "wind_speed_3pm" "humidity_9am"
[13] "humidity_3pm" "pressure_3pm" "cloud_9am"
[16] "cloud_3pm" "rain_today" "risk_mm"
> inputi <- sapply(inputc, function(x) which (x== names(ds)), USE.NAMES=FALSE)
> inputi
[1] 3 4 5 6 7 8 9 10 11 12 13 14 15 17 18 19 22 23
> nobs <- nrow(ds)
> nobs
[1] 366
> dim(ds)
[1] 366 24
> dim(ds[vars])
[1] 366 19
> dim(ds[inputc])
[1] 366 18
> dim(ds[inputi])
[1] 366 18
> intersect(inputi, which(sapply(ds, is.numeric)))
[1] 3 4 5 6 7 9 12 13 14 15 17 18 19 23
> which(sapply(ds, is.numeric))
min_temp max_temp rainfall evaporation
3 4 5 6
sunshine wind_gust_speed wind_speed_9am wind_speed_3pm
7 9 12 13
humidity_9am humidity_3pm pressure_9am pressure_3pm
14 15 16 17
cloud_9am cloud_3pm temp_9am temp_3pm
18 19 20 21
risk_mm
23
> numi <- intersect(inputi, which(sapply(ds, is.numeric)))
> numi
[1] 3 4 5 6 7 9 12 13 14 15 17 18 19 23
> names(ds)[numi]
[1] "min_temp" "max_temp" "rainfall"
[4] "evaporation" "sunshine" "wind_gust_speed"
[7] "wind_speed_9am" "wind_speed_3pm" "humidity_9am"
[10] "humidity_3pm" "pressure_3pm" "cloud_9am"
[13] "cloud_3pm" "risk_mm"
> numc <- names(ds)[numi]
> numc
[1] "min_temp" "max_temp" "rainfall"
[4] "evaporation" "sunshine" "wind_gust_speed"
[7] "wind_speed_9am" "wind_speed_3pm" "humidity_9am"
[10] "humidity_3pm" "pressure_3pm" "cloud_9am"
[13] "cloud_3pm" "risk_mm"
> which(sapply(ds, is.factor))
location wind_gust_dir wind_dir_9am wind_dir_3pm rain_today
2 8 10 11 22
rain_tomorrow
24
> intersect(inputi, which(sapply(ds, is.factor)))
[1] 8 10 11 22
> cati <- intersect(inputi, which(sapply(ds, is.factor)))
> cati
[1] 8 10 11 22
> names(ds)[cati]
[1] "wind_gust_dir" "wind_dir_9am" "wind_dir_3pm" "rain_today"
> catc <- names(ds)[cati]
> catc
[1] "wind_gust_dir" "wind_dir_9am" "wind_dir_3pm" "rain_today"
> ds <- ds[!is.na(ds[target]),]
> if(sum(is.na(ds[vars]))) ds[vars] <- na.roughfix(ds[vars])
> omit <- NULL
> mo <- attr(na.omit(ds[vars]), "na.action")
> omit <- union(omit,mo)
> if(length(omit)) ds <- ds[-omit,]
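> factors <- which(sapply(ds[vars], is.factor)) # NOTE(review): these are positions within ds[vars], not within ds, so ds[[f]] in the loop below may address different columns; names(which(...)) looks safer -- confirm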
> for (f in factors) levels(ds[[f]]) <- normVarNames(levels(ds[[f]]))
> for (f in factors) levels(ds[[f]]) <- normVarNames(levels(ds[[f]]))
> ds[target] <- as.factor(ds[[target]])
> nobs <- nrow(ds)
> dsdate <- paste0("-", format(Sys.Date(), "%y%m%d"))
> dsrdata <- paste0(dsname, dsdate, ".RData")
> save(ds, dsname, dspath, dsdate, target, risk, id, ignore, vars, nobs, omit, inputi, inputc, numi, numc, cati, catc, file=dsrdata)
>
'프로그래밍 Programming' 카테고리의 다른 글
[웹사이트 만들기] (1) 파이썬Python 설치(2.7 버전으로 업그레이드) (0) | 2014.12.10 |
---|---|
아파치apache 버전 확인 방법 (0) | 2014.12.10 |
Data Preparation (19) - Prepare (Save Dataset) (0) | 2014.12.09 |
Data Preparation (18) - Prepare (Numeric and Categoric Variables) (0) | 2014.12.06 |
Data Preparation (17) - Prepare (Variables) (0) | 2014.12.06 |