In-Class Exercise 4

Published

February 4, 2023

Modified

February 14, 2023

# Load the visualisation and statistics packages used in this exercise.
pacman::p_load(
  plotly, DT, patchwork, ggstatsplot, tidyverse
)

# Read the exam scores from the CSV file in the data folder.
exam_data <- read_csv("data/Exam_data.csv")
Rows: 322 Columns: 7
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): ID, CLASS, GENDER, RACE
dbl (3): ENGLISH, MATHS, SCIENCE

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Interactive scatter plot of ENGLISH against MATHS, coloured by RACE.
# The trace type and mode are stated explicitly so plotly does not have to
# guess them and emit the "No trace type specified" / "No scatter mode
# specifed" messages.
plot_ly(
  data = exam_data,
  x = ~ENGLISH,
  y = ~MATHS,
  color = ~RACE,
  type = "scatter",
  mode = "markers"
)
No trace type specified:
  Based on info supplied, a 'scatter' trace seems appropriate.
  Read more about this trace type -> https://plotly.com/r/reference/#scatter
No scatter mode specifed:
  Setting the mode to markers
  Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
# Static scatter plot of ENGLISH against MATHS, stored in `p` so it can be
# handed to ggplotly().
# geom_point() has no `dotsize` parameter (it was ignored with a warning);
# the point size is controlled by `size`.
# NOTE(review): the axis limits assume both scores lie in 0-100; the previous
# -100..100 window left most of the panel empty. Confirm against the data.
p <- ggplot(
  data = exam_data,
  aes(x = MATHS, y = ENGLISH)
) +
  geom_point(size = 1) +
  coord_cartesian(
    xlim = c(0, 100),
    ylim = c(0, 100)
  )
Warning in geom_point(dotsize = 1): Ignoring unknown parameters: `dotsize`
# Convert the static ggplot object into an interactive plotly widget.
ggplotly(p = p)
# Compare MATHS scores between GENDER groups; type = "p" requests the
# parametric test.
# The former `messages` argument is not part of the current
# ggbetweenstats() signature (it was only swallowed by `...`), so it is
# no longer passed.
ggbetweenstats(
  data = exam_data,
  x = GENDER,
  y = MATHS,
  type = "p"
)

# Scatter plot of ENGLISH against MATHS with statistical annotations;
# marginal = TRUE adds the marginal distributions on the axes.
ggscatterstats(data = exam_data, x = MATHS, y = ENGLISH, marginal = TRUE)
Registered S3 method overwritten by 'ggside':
  method from   
  +.gg   ggplot2
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Load the packages for reading Excel files and for model diagnostics.
pacman::p_load(
  readxl,
  performance,
  parameters,
  see
)

# Import the "data" worksheet of the Toyota Corolla workbook, then print it.
car_resale <- read_xls(
  path = "data/ToyotaCorolla.xls",
  sheet = "data"
)
car_resale
# A tibble: 1,436 × 38
      Id Model       Price Age_0…¹ Mfg_M…² Mfg_Y…³     KM Quart…⁴ Weight Guara…⁵
   <dbl> <chr>       <dbl>   <dbl>   <dbl>   <dbl>  <dbl>   <dbl>  <dbl>   <dbl>
 1    81 TOYOTA Cor… 18950      25       8    2002  20019     100   1180       3
 2     1 TOYOTA Cor… 13500      23      10    2002  46986     210   1165       3
 3     2 TOYOTA Cor… 13750      23      10    2002  72937     210   1165       3
 4     3  TOYOTA Co… 13950      24       9    2002  41711     210   1165       3
 5     4 TOYOTA Cor… 14950      26       7    2002  48000     210   1165       3
 6     5 TOYOTA Cor… 13750      30       3    2002  38500     210   1170       3
 7     6 TOYOTA Cor… 12950      32       1    2002  61000     210   1170       3
 8     7  TOYOTA Co… 16900      27       6    2002  94612     210   1245       3
 9     8 TOYOTA Cor… 18600      30       3    2002  75889     210   1245       3
10    44 TOYOTA Cor… 16950      27       6    2002 110404     234   1255       3
# … with 1,426 more rows, 28 more variables: HP_Bin <chr>, CC_bin <chr>,
#   Doors <dbl>, Gears <dbl>, Cylinders <dbl>, Fuel_Type <chr>, Color <chr>,
#   Met_Color <dbl>, Automatic <dbl>, Mfr_Guarantee <dbl>,
#   BOVAG_Guarantee <dbl>, ABS <dbl>, Airbag_1 <dbl>, Airbag_2 <dbl>,
#   Airco <dbl>, Automatic_airco <dbl>, Boardcomputer <dbl>, CD_Player <dbl>,
#   Central_Lock <dbl>, Powered_Windows <dbl>, Power_Steering <dbl>,
#   Radio <dbl>, Mistlamps <dbl>, Sport_Model <dbl>, Backseat_Divider <dbl>, …
# Multiple linear regression of Price on Age_08_04, Mfg_Year, KM, Weight
# and Guarantee_Period.
model <- lm(
  Price ~ Age_08_04 + Mfg_Year + KM + Weight + Guarantee_Period,
  data = car_resale
)

Comment: gtsummary::tbl_regression() can be used to capture the model and translate it into a data table format. # {r} # table1 = tbl_regression(model1, intercept = TRUE) #

Comment: Check for multicollinearity with a diagnostic test and visualise the results using check_collinearity() of the performance package, which uses the VIF (variance inflation factor).

It reports which predictors have low correlation and which predictors have high correlation. After that, we can also plot the result to check the collinearity visually. A VIF above 10 indicates high correlation; a VIF below 3 shows no sign of multicollinearity.

# Print the collinearity diagnostics for the fitted model.
check_collinearity(x = model)
# Check for Multicollinearity

Low Correlation

             Term   VIF     VIF 95% CI Increased SE Tolerance Tolerance 95% CI
 Guarantee_Period  1.04   [1.01, 1.17]         1.02      0.97     [0.86, 0.99]
        Age_08_04 31.07 [28.08, 34.38]         5.57      0.03     [0.03, 0.04]
         Mfg_Year 31.16 [28.16, 34.48]         5.58      0.03     [0.03, 0.04]

High Correlation

   Term  VIF   VIF 95% CI Increased SE Tolerance Tolerance 95% CI
     KM 1.46 [1.37, 1.57]         1.21      0.68     [0.64, 0.73]
 Weight 1.41 [1.32, 1.51]         1.19      0.71     [0.66, 0.76]
# Keep the collinearity check result so it can be plotted.
check_c <- check_collinearity(x = model)
plot(x = check_c)
Variable `Component` is not in your data frame :/

Comment: Checking Normality of the model

# Refit the regression without Mfg_Year.
model1 <- lm(
  Price ~ Age_08_04 + KM + Weight + Guarantee_Period,
  data = car_resale
)

# Check the normality assumption for the refitted model and plot the result.
check_n <- check_normality(x = model1)
plot(x = check_n)

# Run the combined set of model diagnostic plots.
check_model(x = model1)
Variable `Component` is not in your data frame :/

# Plot the parameter table of the refitted model.
model1_parameters <- parameters(model1)
plot(model1_parameters)

# Coefficient plot of the same model drawn by ggstatsplot.
ggcoefstats(model1, output = "plot")

# Per-RACE summary of MATHS: group size, mean, standard deviation and
# standard error of the mean.
# sd() already uses the n - 1 (sample) denominator, so the standard error is
# sd / sqrt(n). Dividing by sqrt(n - 1) applied the correction a second time
# and overstated the error, most noticeably for the small groups.
my_sum <- exam_data %>%
  group_by(RACE) %>%
  summarise(
    n = n(),
    mean = mean(MATHS),
    sd = sd(MATHS)
  ) %>%
  mutate(se = sd / sqrt(n))
my_sum
# A tibble: 4 × 5
  RACE        n  mean    sd    se
  <chr>   <int> <dbl> <dbl> <dbl>
1 Chinese   193  76.5  15.7  1.13
2 Indian     12  60.7  23.4  7.04
3 Malay     108  57.4  21.1  2.04
4 Others      9  69.7  10.7  3.79
# Point-and-error-bar plot: the mean MATHS score of each RACE, with bars
# reaching one standard error above and below the mean.
ggplot(my_sum) +
  geom_errorbar(
    aes(
      x = RACE,
      ymin = mean - se,
      ymax = mean + se
    ),
    width = 0.2,
    colour = "black",
    alpha = 0.9,
    # Line thickness is set with `linewidth`; using `size` for lines is
    # deprecated since ggplot2 3.4.0.
    linewidth = 0.5
  ) +
  geom_point(
    aes(x = RACE, y = mean),
    stat = "identity",
    color = "red",
    size = 1.5,
    alpha = 1
  ) +
  # Title kept on one line: the previous literal embedded a line break plus
  # indentation spaces and misspelled "race".
  ggtitle("Standard error of mean maths score by race")
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

{r} # my_sum %>% # ggplot(aes(x = RACE, # y = MATHS)) + # stat_pointinterval() + # labs( # title = "Visualising confidence intervals of mean math score", # subtitle = "Mean Point + Multiple-interval plot") #