R Tutorial – R Basic Syntax ‎R Overview

R Tutorial – R Basic Syntax ‎R Overview

R is a free software programming language and software environment for statistical computing and graphics. The R language is widely used among statisticians and data miners for developing statistical software and data analysis.



What can R do?

Using the R programming language, one can do:

  • linear and non-linear modelling
  • statistical tests
  • Time-series analysis
  • Classification
  • Clustering



Why R ?

Why use the R programming language? Because:

  • R is Free.
  • It provides a powerful way to do statistical analysis on large sets of data.
  • New functions and packages are created and updated consistently.
  • It has a strong user base.



R Tutorial

# to load a file
	library(dslabs) 		# dslabs is a package
	data(file_name)
		# imports file_name in R
	str(file_name) 		# gives the structure of imported/loaded data set
# combination or concat
	code <- c(380, 124, 818)
	code
	country <- c('italy', 'canada', 'egypt')
	country
# names gives the column names of a data set
	names(code) <- country 	# assigning names to code list
	code
# basic function
	class(country)
length(country)
# numeric vs integer
	num1 <- 3
	num2 <- 3L
	class(num1)
	class(num2)
# sequence generation
	li1 <- seq(10,20,2)
	li2 <- 20:10
	li1
	li2
# subsetting and slicing
	code[2]
country[c(1,3)]
	country[2:3]
# type casting: as.numeric returns a numeric copy; x itself stays character unless the result is assigned back
x <- c('1','2', 10)
	x
class(x)
as.numeric(x)
x
# sorting, arranges in ascending order
	temp <- c(2,50,56,87,101)
	sort(temp)
	temp
# order, returns indexes of the ordered elements
	ord <- c(34,56,38,887,101)
	index <- order(ord)
	index
	class(index)
	ord
	ord[index]
# max and whichmax, whichmax gives the index of greatest, SAME FOR MIN
	max(ord)
	which.max(ord)
	ord[which.max(ord)]
# rank, gives rank from smallest to greatest
rank(ord)
# create dataframe
	state <- c('UP','Gujarat','Bihar','J&K')
	pop <- c(10,20,NA,39)
	df1 <- data.frame(states = state, population = pop)
	df1
# check for NA values i.e. missing values
	ind <- is.na(df1) 	# gives a logical matrix shaped like df1, TRUE where a value is NA
	ind
	sum(ind)		# counts the number of NA values by summing TRUE in ind as TRUE=1
	# average only the numeric column; indexing the whole data frame with !ind would mix in
	# the character 'states' column and mean() would return NA with a warning
	pop_mean <- mean(df1$population, na.rm = TRUE)
	pop_mean		# mean of the population values which are not NA
# logical operators
	#    ind1 <- df$column_name <= 7	# gives a boolean df or vector similar to is.na
	#    df$column_name1[ind1]		# gives the values from dataframe satisfying condition
# WHICH, MATCH and %IN% functions: which() and match() return index numbers, %in% returns a logical vector
	index <- which(df$column == "Brad")	# row positions where column equals "Brad"
	df[index, ]			# index holds row positions, so subset rows (df[index] would pick columns)
	index <- match(c("NY","Florida","Texas"), df$column)	# position of the first match of each value in df$column
	df[index, ]
	x<- c("a","b","c","d")
	y<-c("a","b","e")
	y %in% x 			#gives a boolean output
	c("boston","dakota","washington") %in% df$column_name # checks if three items are in column_name or not and returns boolean
	ind <- which(!abbs %in% murders$abb)	# positions in abbs of the entries that are not present in murders$abb
# manipulating data tables and advanced analysis can be done using the 'DPLYR' package(for working with tables)!
	df1 <- mutate(df1,col4=........)	# adds column col4 to df1. mutate is used to add columns
	head(df1)			# prints first 6 rows of df1
	filter(df1, rate <=0.7)		# filter prints rows satisfying condition given from df1
	new_table <- select(df1, col1,col2)	# only selects col1 and col2 from df1 and makes new data frame as new_table
	df1 %>% select(col2, col3, col4) %>% filter(col4 <= 0.7)	# pipe operator can combine diffrent conditions into one
	filter(murders, rate < 1 & (region == 'Northeast' | region == 'West')) %>% select(state, rate, rank)
# creating data frame
	grades <- data.frame(name = c("aug","july","june"),
			exam = c(95, 96, 97),
			exam2 = c(10, 20, 30),
			stringsAsFactors = FALSE)	# argument name is case-sensitive; keeps text columns as character
	class(grades$name) 	# "character". Before R 4.0 text columns defaulted to 'factor' unless stringsAsFactors = FALSE was given
	filter(grades, name != 'july')	# prints data frame without july as name
	murders_nw <- filter(murders, region %in% c("Northeast", "West")) 	# print rows with region northeast and west
	filter(murders, population < 5000000 & region == "Northeast")		# another way of multiple condition
	my_states <- filter(murders, rate < 1 & (region == 'Northeast' | region == 'West'))
# rank function
	x <- c(88, 100, 83, 92, 94)
	rank(x)		# gives rank of elements from lowest to highest
	rank(-x)	# negate the values to rank from highest to lowest
# nrow()
	nrow(df1)		# counts number of rows
# Plots. These are built in R plotting functions. Most popular package for plotting is ggplot.
	plot(x, y)		# makes a scatter plot
	hist(column_name)
	boxplot(col1~col2, data = df1)	# creates a box plot comparing col1 and col2. col2 is according to which we are stratifying
# IF statement (a conditional, not a loop)
	if(boolean expression){
		expression
	} else{
		expression
	}
# ifelse
	no_nas <- ifelse(is.na(na_example), 0, na_example)	# first is condition, then TRUE expression, then FALSE expression in one line
	sum(is.na(no_nas))	# confirms there are no more NAs in no_nas object
# ANY and ALL
	z <- c(TRUE, FALSE, FALSE)	# logical constants are upper-case in R; lower-case true/false are undefined names
	any(z)	# will give TRUE. ANY takes a logical vector input and returns TRUE if any one entry is TRUE
	all(z)	# returns FALSE. ALL returns TRUE if all elements are TRUE and otherwise FALSE
# defining functions
	avg <- function(x){	# arithmetic mean of x. A function may also take several arguments, e.g. function(x,y,z)
		# the sum and the length exist only for the duration of the call;
		# nothing computed here is stored in the workspace
		sum(x) / length(x)
	}
# FOR loop
	for(i in 1:5){
	print(i)
	}
# functions used instead of FOR - apply, sapply, tapply, mapply
# defining a vector variable of fixed length (vector() creates a logical vector unless a mode is given)
	a <- vector(length = 25)
# creating LIST. it can contain characters, numbers, vectors and matrix. can contain different data types
	list_name <- list("red","green", c(1,2,3), TRUE, 51.23)
	list_name <- list(c("JAN","FEB","MAR"), matrix(c(3,4,5,1,-2,5), nrow=2))
	names(list_name) <- c("Quarter", "A_Matrix")	# gives name to the elements in list list_names i.e kind of column names
	unlist(list_name) 	# this unlist function converts list object into a vector object
# creating MATRIX. Define the row names and column names beforehand as character vectors
	row_names <- c("row1", "row2")
	col_names <- c("col1", "col2", "col3")
	# byrow = TRUE fills the matrix row by row; byrow = FALSE (the default) fills it column by column
	P <- matrix(c(3,4,5,1,-2,5), nrow=2, byrow=TRUE, dimnames = list(row_names, col_names))
	print(P[1,3])	# Access the element at 3rd column and 1st row
	print(P[2,])	# Access only the  2nd row
	print(P[,3])	# Access only the 3rd column
# Working directory
	> getwd()			#gives current working directory
# Merge and Join
	> merge(df1, df2, by = "col_name")		 # (INNER JOIN)mergers df1 and df2 on the given column
	> merge(df1, df2, by = "col_name", all.x = TRUE) # (Left Join)
	> merge(df1, df2, by = "col_name", all.y = TRUE) # (Right Join)
	> merge(df1, df2, by = "col_name", all = TRUE)	 # (Full Join)
# R also has IF, IF ELSE, SWITCH, REPEAT, WHILE, FOR
	v <- "hello"
	cnt <- 2
	# repeat runs until break is reached; cnt is already above 1 after the first pass,
	# so v is printed exactly once
	repeat {
		print(v)
		cnt <- cnt + 1
		if (cnt > 1) {
			break
		}
	}
	v <- "hello"
	cnt <- 2
	# the counter has to advance inside the body, otherwise cnt < 7 stays TRUE and the loop never ends
	while (cnt < 7) {
		print(v)
		cnt <- cnt + 1
	}
# Functions
	# prints the squares of 1, 2, ..., a (one per line); prints nothing when a is 0
	new.function <- function(a){
		# seq_len(a) is empty for a = 0, whereas 1:a would count down through 1 and 0
		for (i in seq_len(a)) {
			b <- i^2
			print(b)
		}
	}
# R can read json files using the rjson package
	> install.packages("rjson")
	> library("rjson")	# loads the package into R after install
	> new_file <- fromJSON(file = "file_name.json") # importing json file into a R object.
# reading a CSV file into R
	new_file <- read.csv("file_name.csv") 	# assignment uses <- (<= is a comparison). The file should be in the current working directory
	# general form: data <- subset(df_name, <condition on columns>)   keeps the rows for which the condition is TRUE
	data <- subset(df1, salary == max(salary)) 	# rows of df1 that have the max salary
	data <- subset(df1, salary > 600 & dept == "IT")	# rows where both conditions hold
	write.csv(df1, "output.csv", row.names = FALSE)	# writes a new csv file into CWD
# to read XML files
	library("XML")	# required library
	library("methods")	# required library
	new_file <- xmlParse(file = "file_name.xml")
	rootnode <- xmlRoot(new_file)	# extracts the root node from imported xml file
	print(rootnode[1])	# prints the data from first node
	print(rootnode[[1]][[1]])	# first element of first node
	rootsize <- xmlSize(rootnode)	# finds number of nodes in the root, so it is applied to the root node, not the document
	xmldf <- xmlToDataFrame("file_name.xml")	# converts xml file into dataframe
# RODBC is used to read database from R
	> install.packages("RODBC")
	> library("RODBC")
	> SQL.df <- odbcConnect("sqlserverodbc", uid="", pwd="");
# MEAN
	> x <- c(1,2.6,17.5,-21,54,18,4)
	> output <- mean(x)
	> output <- mean(x, trim = 0.3)	# trim is a fraction: drops floor(0.3 * n) values from each end of the sorted series (n = 7 here, so 2 per end)
	> output <- mean(x, na.rm = TRUE) # removes all the 'NA' in series and then calculates MEAN
# MEDIAN
	> x <- c(2,5,21.5,9,78)
	> output <- median(x)	# gives median of the series
# MODE. No inbuilt function for the statistical mode. user defined function has to be made
	# data.mode(v): returns the most frequent value of v; on a tie, the value that appears first in v
	> data.mode <- function(v) {
	  # distinct values of v, in order of first appearance
	  uniqv <- unique(v)
	  # match() maps each element of v to its position in uniqv, tabulate() counts
	  # the occurrences per position, which.max() picks the first largest count
	  uniqv[which.max(tabulate(match(v, uniqv)))]
	  }
# Linear Regression model can be done by using lm() function (gives the coefficients)
# glm() is the Generalized Linear Model function, used to fit models such as logistic regression
# Summary
	> print(summary(df_name/dataset_name))
# Analysis of variance and ANOVA
	> result <- aov(formula_used, data = dataset_name)
	> print(anova(result))
# CHI SQUARE Test (MASS library has to be loaded)
	> library("MASS")
	> print(chisq.test(dataset_name))
# Multiple Regression
	- first do linear regression using lm(). this will give the coefficients (including the intercept)
	- individually the intercept can be printed using coef()
		> coef(model)[1] (model is the linear regression one fitted using lm())
# Decision Tree. Uses Library 'party'
	library("party")
	input.dat <- data_set[c(1:110),] # input the data into new data set/data frame
	png(file = "dicision tree.png")  # opens a PNG graphics device; the chart is written to this file
	# create the tree: ctree is an ordinary function call, so its arguments go in parentheses
	output.tree <- ctree(
		nativespeaker ~ age * shoesize * score,
		data = input.dat)
	plot(output.tree)	# draws the tree onto the open PNG device
	dev.off()		# closes the device so the PNG file is saved in the working directory

Credit : Bhawill Panchal

3 Replies to “R Tutorial – R Basic Syntax ‎R Overview

  1. Tremendous things here. I am very glad to look your article.
    Thanks so much and I am looking forward to contact you.
    Will you please drop me a mail?

  2. Hi there colleagues, how is all, and what you want to say regarding this paragraph, in my view its truly amazing for me.

Comments are closed.