Sei sulla pagina 1di 3

How to read csv

columns type
data format
change column name
Adding new column

finding mean, median for one column

creating summary table

# Clear workspace
# Removes every object returned by ls() from the current environment.

rm(list = ls())

# Load libraries
# readr provides read_csv(...) used below. You may need additional libraries.
# (The original trailing comment wrapped onto a bare, uncommented line, which
# does not parse as R; it is kept here as a full-line comment instead.)

suppressPackageStartupMessages(library(readr))
library(dplyr) # For mutate(...) below. You may need additional libraries
library(ggplot2)
library(reshape2)
library(scales)
library(choroplethr)
library(choroplethrMaps)
library(RCurl)
library(WikipediR)

**************************************************
# Column specification for movies.csv: one readr parser per column name.
my_col_types <- cols(
  Title = col_character(),
  Studio = readr::col_factor(), # "readr::" to avoid conflict with scales
  Release = col_date(format = "%m/%d/%Y"),
  Screens = col_integer(),
  Gross = col_integer(),
  RealGross = col_double(),
  Sample_column = col_skip()
)

# Example of a cols_only() specification. Its result is not assigned, so it
# does not affect the read_csv() call below.
cols_only(
  required_col1 = col_character()
)

# Read the file with the specification above; "NA" and empty strings are
# treated as missing values.
df <- read_csv(
  "movies.csv",
  col_types = my_col_types,
  na = c("NA", "")
)


**************************************************************

# Rename a column by matching on its current name.
names(df)[names(df) == "old name"] <- "new name"

# Same pattern: the column currently called "Clinton" becomes "value".
names(df)[names(df) == "Clinton"] <- "value"

********************************************************************
# Derived column: every value of oldcol multiplied by 10.
df$newcol <- df$oldcol * 10

# Constant column: the single string "Yes" is assigned to the new column.
df$newcol1 <- "Yes"

*********************************************

# Mean of col2; na.rm = TRUE drops missing values before averaging.
my_mean <- mean(df$col2, na.rm = TRUE)

********************************************
# Mean of col2 within each col1 group, ignoring missing values.
df %>%
  group_by(col1) %>%
  summarize(col2mean = mean(col2, na.rm = TRUE))

***************************************

# Number of rows in each col1 group.
# summarize() needs a data frame as its first argument; called on its own (as
# in the original fragment) it fails, so the grouped data is piped in.
df %>%
  group_by(col1) %>%
  summarize(col2count = n())

***************************************

# Add newcolumnname, computed as oldcol times 100, and store the result in df.
df <- df %>%
  mutate(newcolumnname = oldcol * 100)

*******************************************************

To find the number of columns in a dataframe:


# ncol() returns the number of columns in df.
ncol(df)

*******************************************************************************

Handy command to change format of data.


# Convert DateTime to POSIXct using month/day/two-digit-year hour:minute,
# interpreted in the America/Chicago time zone.
# The format is passed by name: the second formal of as.POSIXct() is tz, so an
# unnamed format string only reaches the parser through `...` and can bind to
# a different argument depending on the input type.
df$DateTime <- as.POSIXct(
  df$DateTime,
  format = "%m/%d/%y %H:%M",
  tz = "America/Chicago"
)

***********************************************************************************
*************

How to create a logical vector? A logical vector is a vector whose elements are
TRUE or FALSE. For example: use the == comparison to create a logical vector that
shows which levels in levels(df$Shape) equal "Disk". Store the result in a variable
called wh.
# TRUE for each level of df$Shape that equals "Disk", FALSE otherwise.
wh <- levels(df$Shape) == "Disk"

***********************************************************************************
****

To keep only the rows of df that don't have NA in any column:


# Keep only rows where complete.cases() is TRUE, i.e. rows with no NA.
df <- df[complete.cases(df), , drop = FALSE]

***************************************************************************

Sorting the dataframe records in ascending and descending order.


# Ascending order of ccDebt.
df <- df %>% arrange(ccDebt)
# Descending order of ccDebt (replaces the ascending result stored in df).
df <- df %>% arrange(desc(ccDebt))

Filter dataframe for conditions:


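AND condition (comma-separated conditions are combined with AND): filter(df, yearsEmploy == 0, ccDebt >= 10000)
AND condition (explicit &): filter(df, (yearsEmploy == 0 & ccDebt >= 10000))
OR condition: filter(df, yearsEmploy == 0 | ccDebt >= 10000)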

Use mutate to add a new column.


df <- mutate(df, cScore_binned = <expression for the new column>)

Group and find the number of records per group: Here the group is by yearsEmploy
and default column of dataframe df.
# Number of records for each yearsEmploy / default combination.
df %>%
  group_by(yearsEmploy, default) %>%
  summarize(count = n())

Group and find the mean of a column per group: Here the group is by yearsEmploy
and default column of dataframe df.
# Mean of Score for each yearsEmploy / default combination.
df %>%
  group_by(yearsEmploy, default) %>%
  summarize(Meanscore = mean(Score))

Group and find the median of a column per group: Here the group is by yearsEmploy
and default column of dataframe df.
# Median of Score for each yearsEmploy / default combination.
# The output column is named Medianscore; the original called it Meanscore
# even though it holds a median.
df %>%
  group_by(yearsEmploy, default) %>%
  summarize(Medianscore = median(Score))

To find the sum of a column per group, you can use summarize with sum(), e.g.
group_by(df, Category) %>% summarize(total = sum(Frequency)), or the base R
aggregate function:
aggregate(
  x = df$Frequency,
  by = list(Category = df$Category),
  FUN = sum
)
Answer would be like below:
Category x
1 First 30
2 Second 5
3 Third 34

Later we can rename the column name from x to something meaningful using colnames
or names function.

***********************************************************************************
*****************************************
# Clear workspace

rm(list = ls())

# Load libraries
# One suppressPackageStartupMessages() call wraps every library() call, so the
# packages attach in the same order with their startup messages silenced.
suppressPackageStartupMessages({
  library(readr)
  library(dplyr)
  library(ggplot2)
  library(reshape2)
  library(scales)
  library(choroplethr)
  library(choroplethrMaps)
  library(RCurl)
  library(WikipediR)
  library(rvest)
  library(maps)
  library(ggmap)
  library(DBI)
  library(RSQLite)
})

**********************************************************************

# Read the second sheet in the Excel file.
# read_excel() belongs to the readxl package, which none of the library()
# calls in this file attach, so it is called through its namespace.
df <- readxl::read_excel("example.xlsx", sheet = 2)

# Show the first rows.
head(df)

******************************************
# Downloading files from the Web
# Save the remote CSV to data.csv in the working directory, then read it.
url <- "https://data.cityofchicago.org/api/views/xzkq-xp2w/rows.csv"
download.file(url = url, destfile = "data.csv")
df <- readr::read_csv(file = "data.csv")

Potrebbero piacerti anche