-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDataClubTutorial.R
More file actions
58 lines (45 loc) · 1.25 KB
/
DataClubTutorial.R
File metadata and controls
58 lines (45 loc) · 1.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# Data club tutorial ----
# Demonstrates several ways of reading the same CSV file with read.csv().

# Install dplyr only when it is missing, so re-running the script does not
# re-download the package on every run.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)

# Single place to change the input location.
data_path <- "~/Downloads/data.csv"

# Default read: the first line of the file supplies the column names.
data <- read.csv(data_path)

# Skip the first 100 lines of the file before reading.
# NOTE(review): the skipped lines include the header line, so the first
# line read after the skip is used for the column names -- confirm this is
# what the demo intends.
data_skip_rows <- read.csv(data_path, sep = ",", skip = 100)

# Treat every line, including the header line, as data.
data_no_head <- read.csv(data_path, sep = ",", header = FALSE)

# Use radius_mean as the row index.
# read.csv(..., row.names = "radius_mean") stops with an error when the
# column holds repeated values, because data-frame row names must be unique.
# That is presumably why the original call failed; verify with
# anyDuplicated(data$radius_mean). make.unique() appends a numeric suffix to
# repeated labels so the assignment always succeeds.
data_mod_index <- data
radius_labels <- as.character(data_mod_index$radius_mean)
rownames(data_mod_index) <- make.unique(radius_labels)
# Drop the column that now lives in the row names, as read.csv() would.
data_mod_index$radius_mean <- NULL
# Inspect the data frame: column names, first/last 10 rows, dimensions.
colnames(data)
head(data, n = 10)
tail(data, n = 10)
dim(data)

# Print one whole column, then one whole row (row 100, all columns).
data$area_mean
data[100, ]

# drop col method I like: assigning NULL removes the column.
data$compactness_mean <- NULL
colnames(data)

# drop row: negative indices remove rows by position.
# The two drops run one after the other, so positions 5:10 in the second
# call refer to the already-shortened data frame (rows 6 to 11 of the data
# frame as it was before the first drop).
data <- data[-5, ]
data <- data[-(5:10), ]

# Keep only the columns that contain no missing values. drop = FALSE keeps
# the result a data frame even when a single column survives the filter.
data <- data[, colSums(is.na(data)) == 0, drop = FALSE]
# July 13 ----
# Import data
data <- read.csv("~/Downloads/breast_cancer_subset.csv")
head(data)
subset_data <- data

# Column positions used for the multi-column statistics below, defined once
# so the three calls that use them cannot drift apart.
# NOTE(review): assumes columns 3 to 7 are numeric -- confirm against the
# file.
stat_cols <- 3:7

# Compute summary stats
summary(subset_data)

# Correlation values
cor(subset_data[, stat_cols])

# Check mean of a given column
mean(subset_data$radius)

# Check SD of a given column
sd(subset_data$radius)

# Check min and max of a given column
min(subset_data$radius)
max(subset_data$radius)

# Mean of every (numeric) column. vapply() declares that each column yields
# exactly one number, so the result is always a named numeric vector.
vapply(subset_data[, stat_cols], mean, numeric(1))
# also
colMeans(subset_data[, stat_cols])

# Count number of elements in a specific column
length(subset_data$radius)

# Calculate sum of the values in a given column
sum(subset_data$radius)