-
Notifications
You must be signed in to change notification settings - Fork 6
/
RScripting101.R
295 lines (218 loc) · 8.19 KB
/
RScripting101.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
# This is a "comment"
# Anything preceded by a "#" will not be executed

# Objects -----------------------------------------------------------------

# R is an object-oriented language, meaning everything you do revolves around
# the use of, you guessed it, objects. This is opposed to procedural
# programming (like C). An object is a data container, which can take a number
# of forms (known as data structures). An object also contains procedures
# (code), but that is not so useful to explain just yet.
# As an aside, you can also do "Functional Programming" in R.

# All objects have a name too. Naming an object is called "assignment".
# In R you should assign with "<-", as opposed to most other languages, where
# one uses "=". However, unless you are writing packages, you can choose,
# e.g.
my.number <- 100
# is the same as
100 -> my.number
# and
my.number = 100

# You should name your objects sensibly; this makes it easy to find them.
# Here, 100 is the object, and "my.number" is the name we are giving it.
my.number

# You can read about naming conventions in the Google style guide, along with
# a whole bunch of other "rules" you should try to follow
# https://google.github.io/styleguide/Rguide.html

# So what are the data structures -----------------------------------------

# Atomic Vectors
# Atomic vectors are the "simplest" form of data you usually run into; these
# are often referred to as just "vectors". They are one-dimensional (flat) and
# have a "length".
# The simplest is one with length 1, which is referred to as a "scalar".
# my.number (assigned above) is such a length-1 vector.
length(my.number)

# Vectors can obviously have any length
my.vector <- 1:10
my.vector
my.vector <- 1:1000000
head(my.vector)
# Every atomic vector holds exactly one "data type".
# The 4 main data types to be concerned with are listed below (each name is
# also a base R function, so evaluating it prints that function):
integer
double # or numeric
logical
character

# Integer is the simplest to imagine
my.integer <- integer(10)
my.integer <- as.integer(c(1, 2, 3, 4, 5))
my.integer <- 1:5 # no need to coerce if we are happy to leave it up to R

# Numeric is similar, except that it is a floating point value
my.numeric <- seq(from = 0, to = 1, by = 0.25)
my.numeric

# Logical takes the values TRUE or FALSE
my.logical <- c(TRUE, FALSE, TRUE, FALSE, TRUE)
my.logical

# Character, also known as a string, is simply text
my.character <- c("one", "two", "three", "four", "five")
my.character
# On top of its regular values, each of these data types can contain an
NA
# In R, NA is not the same thing as NULL - compare:
str(c(1, 2, 3, NA, 5))
str(c(1, 2, 3, NULL, 5))

# str() shows what type of object/data type something is; the is.*()
# functions test for one particular type
str(my.integer)
str(my.logical)
is.character(my.character)
is.double(my.numeric)
is.numeric(my.numeric)

# A vector can be "coerced" into another type
as.character(my.integer)
# To see coercion in action, look at what happens with
str(c(1, FALSE, "string", 1.3))
str(c(1, 2, 3, 4.5, 6.7))

# Attributes are not discussed here, but check out
?attr # and
?attributes
attr(my.character, "whatami") <- "This is a vector of strings"
attr(my.character, "whatami")
attributes(my.character)
# Lists come next; they are in fact a type of vector and so also have a
# length, but every element of a list may be of any data structure (even
# another list)
my.list <- list(
  1:5,
  seq(from = 0, to = 1, by = 0.25),
  c(TRUE, FALSE, TRUE, FALSE, TRUE),
  c("one", "two", "three", "four", "five")
)
str(my.list)

# The objects created earlier could have been used instead
str(list(my.integer, my.numeric, my.logical, my.character))

# Lists are the main underlying data structure for most complex objects in R.
# This includes data frames (shown shortly) and model fit objects, for example.
# Matrices and arrays are pretty self-explanatory - they are what you would be
# used to in any mathematical/data environment. A matrix is 2D and an array is
# nD, where each column has an equal row length - the only caveat is that all
# elements must be of the same data type.
matrix(1, nrow = 4, ncol = 4)
matrix(1:20, nrow = 4)
matrix(1:20, ncol = 4)
# Spell out TRUE: T is an ordinary variable that can be reassigned
matrix(1:20, ncol = 4, byrow = TRUE)
as.matrix(my.integer)
as.matrix(c(my.integer, my.character))

# There is more to know about using and making matrices, but nothing that
# needs covering now - the key thing to note is that they have a dim()
# attribute
dim(matrix(1:20, nrow = 4))
dim(matrix(1:20, ncol = 4))
# Data frames look conceptually like a matrix but are really a special type of
# list in which every element must have the same length. Because they are
# lists, each column may hold a different data type.
my.df <- data.frame(
  1:5,
  seq(from = 0, to = 1, by = 0.25),
  c(TRUE, FALSE, TRUE, FALSE, TRUE),
  c("one", "two", "three", "four", "five")
)
my.df
data.frame(my.list) # looks the same?

# Naming the columns makes life easier for us
names(my.df) <- c("int", "num", "logi", "char")
my.df
# On R versions before 4.0.0 the character column shows up as a factor here;
# stringsAsFactors = FALSE keeps it as character (the default since R 4.0.0)
str(my.df)

# Named objects can of course be used to build a data frame, in which case the
# names() are inherited
str(data.frame(my.integer, my.numeric, my.logical, my.character,
  stringsAsFactors = FALSE
))
# Accessing elements of your objects --------------------------------------

# How bits of an object are accessed (subsetting) depends a little on the
# data structure

# First element of an atomic vector
my.integer[1]
# First 3 elements of an atomic vector
my.integer[1:3]
# First 3 elements of an atomic vector, stored as a new object
my.integer.subset <- my.integer[1:3]
str(my.integer.subset)

# First element of a list
my.list[[1]]
# Note how these differ
str(my.list[[1]])
str(my.list[1])
str(my.list[1:3])

# Once the list has names,
names(my.list) <- c("int", "num", "logi", "char")
# its elements can be accessed via the name
my.list$int
my.list["num"]

# Data frames are lists, so their elements are accessed the same way
my.df$int
my.df["num"]

# They also have a matrix-like structure, so they can be indexed as such
# Whole first row
my.df[1, ]
# Whole first column
my.df[, 1]
# Element in row 1, column 4
my.df[1, 4]
# Whole column, selected by name
my.df[, "int"]

# Try these ones
my.df[1:3, ]
my.df[, 1:3]
my.df[1:3, 1:2]
my.df[, c(1, 3)]
my.df[, c("int", "char")]
my.df[c(TRUE, TRUE, TRUE, FALSE, FALSE), ]
my.df[my.logical, ]
# Functions ---------------------------------------------------------------

# Unsurprisingly, functions are objects as well. They make life easy.
# Only the basics are covered today, not the details.
# A function definition (also called a declaration) has this basic form.
# The names below are placeholders: `what.you.want.as.output` is not defined
# anywhere in this script, so the template is for reading, not for calling.
Function <- function(argument.1, argument.2, ...) {
  # use, analyse or manipulate the input arguments here
  return(what.you.want.as.output)
}
# Querying a function with ? (e.g. ?lm) gives information on the arguments it
# requires/accepts.
# Arguments can be given in order (by position) or by name.
# Both are useful; functions usually have a few ordered arguments followed by
# some more arguments that carry default values, e.g. lm()
data(beavers)
?lm
lm(temp ~ time, beaver1)
lm(formula = temp ~ time, data = beaver1)
lm(formula = temp ~ time, data = beaver1, method = "qr")
# Recall the stringsAsFactors = FALSE argument used earlier - it is another
# named argument with a default value
str(data.frame(my.integer, my.numeric, my.logical, my.character))
str(data.frame(my.integer, my.numeric, my.logical, my.character,
  stringsAsFactors = TRUE
))
str(data.frame(my.integer, my.numeric, my.logical, my.character,
  stringsAsFactors = FALSE
))

# Help pages of some useful functions
?str
?colnames
?row.names
?dim
?summary
?head
?hist
?plot
# Logical operators -------------------------------------------------------

# These behave just like normal mathematical logic
# Try:
1 == 1
1 == 2
1 > 0
1 > 2
1 != 1
1 != 2
1 == 1 & 2 == 2
1 == 1 & 2 == 3
1 == 1 | 2 == 3
TRUE > FALSE
sum(TRUE)
sum(my.logical)
my.df$int == 1
my.df$int == 0
my.df$int > 1
sum(my.df$int > 1)

# Logical operators can be combined with subsetting; read more about this if
# you like
my.df[my.df$int == 1, ]
my.df[my.df$int > 1, ]
my.df[my.df$char == "one", ]
# Read about "if statements" to put logical operators to more practical use,
# e.g.
# a TRUE condition evaluates the first branch
if (TRUE) {
  "DO SOMETHING"
} else {
  "THIS HAPPENS"
}

# a FALSE condition falls through to the else branch
if (FALSE) {
  "DO SOMETHING"
} else {
  "THIS HAPPENS"
}
# More... -----------------------------------------------------------------
# Google it
# Go here: http://adv-r.had.co.nz/
# Come to some more R User Group sessions on functions, looping etc.