Cross Validated Asked on January 3, 2022
After conducting a short survey, I have collected the results in the form of a dataframe. This is a reproducible version of what the actual data frame looks like.
library(dplyr)
library(tidyr)
df=data.frame(ID=c("1101","1102","1103","1104",
"1105","1106","1107","1108",
"1109","1110","1111","1112",
"1113","1114","1115","1116",
"1117","1118","1119","1120",
"1121","1122","1123","1124",
"1125","1126","1127","1128",
"1129","1130","1131","1132",
"1133","1134","1135","1136",
"1137","1138","1139","1140",
"1141","1142","1143","1144",
"1145","1146","1147","1148",
"1149","1150","1151","1152",
"1153","1154","1155","1156"),
Country=c("US","UK","Canada","Mexico",
"India","US","Peru","China",
"US","UK","Canada","Mexico",
"Portugal","India","Portugal","Mexico",
"Peru","India","Canada","Mexico",
"India","UK","India","Canada",
"US","UK","China","India",
"US","Mexico","Canada","Mexico",
"Canada","China","Canada","Canada",
"China","China","India","Mexico",
"Portugal","Portugal","Portugal","Portugal",
"UK","UK","UK","Peru",
"Peru","Mexico","US","US",
"Peru","Mexico","Peru","Mexico"),
Gender=c("Male","Male","Male","Female",
"Female","Female","Male","Female",
"Female","Female","Male","Female",
"Male","Male","Female","Female",
"Female","Male","Female","Female",
"Female","Female","Male","Female",
"Male","Female","Male","Female",
"Female","Male","Female","Female",
"Male","Male","Male","Female",
"Male","Male","Female","Female",
"Male","Female","Male","Female",
"Male","Female","Male","Female",
"Male","Female","Male","Female",
"Male","Male","Male","Male"),
Age=c("<25","25-35","25-35","36-45",
">55",">55","25-35",">55",
"<25","25-35","25-35","36-45",
"25-35","25-35","25-35","36-45",
">55","36-45","46-55","36-45",
">55","46-55","25-35","46-55",
"<25","46-55","25-35","46-55",
"25-35","25-35","46-55","36-45",
"<25","<25",">55","36-45",
"36-45","46-55","<25","<25",
"<25",">55","36-45","46-55",
"<25",">55","36-45","46-55",
"36-45",">55","36-45","46-55",
"<25","46-55","<25","46-55"),
Score_Q1=c(4,4,3,2,
1,1,4,2,
1,1,1,2,
2,1,4,3,
4,3,1,1,
1,2,1,1,
1,4,1,4,
3,4,3,3,
1,3,3,1,
1,1,2,1,
1,2,1,2,
1,1,1,1,
2,2,2,2,
1,2,3,4),
Score_Q2=c(1,4,1,1,
1,2,1,1,
1,4,4,4,
2,1,1,3,
4,3,1,1,
1,3,3,3,
2,4,1,2,
4,4,4,4,
1,1,1,1,
1,2,3,4,
4,4,2,1,
1,2,3,2,
1,2,1,2,
4,3,2,1))
The survey is split into the following parts-
1) ID: A respodent ID
2) Country: Country of origin of respondent
3) Gender: The gender of the respondent
4) Age: The age of the respondent
5) Score_Q1: The satisfaction score for Q1, on a scale from 1
(Very satisfied) to 4
(Very dissatisfied).
6) Score_Q2: The satisfaction score for Q2, on a scale from 1
(Very satisfied) to 4
(Very dissatisfied).
First, I convert the columns Age
, Gender
and Country
to factors
#convert to factor
df$Country=as.factor(df$Country)
df$Gender=as.factor(df$Gender)
df$Age=as.factor(df$Age)
Next, I check my data to see what the ratios by socio-demographic variables looks like –
I begin with Gender
, and see what the sample ratios are for gender by Country
#1) Gender by Country: Sample Ratio
split_gender=df %>% select(Country,Gender) %>%
group_by(Gender,Country) %>%
summarise(n=n()) %>%
ungroup() %>%
select(Country,Gender,n) %>%
group_by(Country,add=TRUE) %>%
spread(Country,n)
split_gender=data.frame(apply(split_gender, 2, as.numeric))
split_gender_sample=as.data.frame(sweep(split_gender,2,colSums(split_gender),`/`))
split_gender_sample[1,1]="Female"
split_gender_sample[2,1]="Male"
I do the same for Age
by Country
#2) Age by Country: Sample Ratio
split_age=df %>% select(Country,Age) %>%
group_by(Age,Country) %>%
summarise(n=n()) %>%
ungroup() %>%
select(Country,Age,n) %>%
group_by(Country,add=TRUE) %>%
spread(Country,n)
split_age=data.frame(apply(split_age, 2, as.numeric))
split_age[is.na(split_age)] <- 0
split_age_sample=as.data.frame(sweep(split_age,2,colSums(split_age),`/`))
split_age_sample[1,1]="<25"
split_age_sample[2,1]=">55"
split_age_sample[3,1]="25-35"
split_age_sample[4,1]="36-45"
split_age_sample[5,1]="46-55"
#Clean up unwanted dataframes
rm(list=c('split_age','split_gender'))
The above two steps give me two data frames – split_age_sample
& split_gender_sample
. These dataframes contain the sample ratios for age and gender by country for my 56 respondents.
My Objective: Calculating Sampling Weights Based on Gender & Age
In order to make my data frame more representative of reality, I would like to attribute weights to my respondents based on the official population ratios for age and gender by country.
Following some online research, these are the latest population ratios I found for the countries I surveyed.
#Gender by Country
split_gender_official=data.frame(Gender=c("Female","Male"),
Canada=c(0.4,0.6),
China=c(0.3,0.7),
India=c(0.3,0.7),
Mexico=c(0.5,0.5),
Peru=c(0.6,0.4),
Portugal=c(0.5,0.5),
UK=c(0.4,0.6),
US=c(0.4,0.6))
#Age by Country
split_age_official=data.frame(Age=c("<25",">55","25-35","36-45","46-55"),
Canada=c(0.1,0.3,0.3,0.2,0.1),
China=c(0.3,0.05,0.35,0.1,0.2),
India=c(0.5,0.05,0.35,0.05,0.05),
Mexico=c(0.2,0.3,0.2,0.1,0.2),
Peru=c(0.1,0.3,0.2,0.2,0.2),
Portugal=c(0.2,0.1,0.05,0.05,0.6),
UK=c(0.2,0.3,0.1,0.3,0.1),
US=c(0.2,0.3,0.1,0.3,0.1))
Desired Output
Based on my sample ratios and the offical population ratios for both age & gender, I’d like to attribute weights to my respondents, in a separate column called weights
.
Currently I am unable to figure out how to do this calculation.
Then, once the weights are calculated, I’d like to summarize the scores using the weights
column. The aggregation would look something like this (except with the weights included in the calculation) –
#Calculate weighted overall scores by Country & Gender: example UK
weighted_aggregated_scores_gender=df %>%
select(-Age) %>%
group_by(Country,Gender) %>%
filter(Country=='UK') %>%
summarise(Q1_KPI=round(sum(Score_Q1 %in% c(1,2)/n()),2),
Q2_KPI=round(sum(Score_Q2 %in% c(1,2)/n()),2))
I’d really appreciate any help I can get on the weight calculation and its usage in the aggregation step that follows.
You can use ?melt to convert your official ratios to a long format, and then ?merge with the original dataframe to get weights. summarizing is easily done with data.table
#Convert to long dataframe format
split_age_official <- melt(split_age_official, variable.name="Country", value.name="popratio.Age")
split_gender_official <- melt(split_gender_official , variable.name="Country", value.name="popratio.Gender")
# merge with original dataframe
df <- merge(df, split_age_official, by=c("Country", "Age"))
df <- merge(df, split_gender_official, by=c("Country", "Gender"))
# Calculated weighted scores
df$Weighted_Score_Q1.Gender <- df$Score_Q1 * df$popratio.Gender
df$Weighted_Score_Q1.Age <- df$Score_Q1 * df$popratio.Age
df$Weighted_Score_Q2.Gender <- df$Score_Q2 * df$popratio.Gender
df$Weighted_Score_Q2.Age <- df$Score_Q2 * df$popratio.Age
I usually prefer datatable for summarization, but you can use dplyr as well of course.
A data.table summarization would be something like:
library(data.table) # install.packages("data.table")
df <- data.table(df)
Weighted_Gender <- df[,list(Q1_KPI=sum(Weighted_Score_Q1.Gender, na.rm=TRUE), Q2_KPI=sum(Weighted_Score_Q2.Gender, na.rm=TRUE)), by=c("Country", "Gender")]
Weighted_Age <- df[,list(Q1_KPI=sum(Weighted_Score_Q1.Age, na.rm=TRUE), Q2_KPI=sum(Weighted_Score_Q2.Age, na.rm=TRUE)), by=c("Country", "Age")]
You can subset by the country you want, for example:
Weighted_Gender[Country=="UK"]
Answered by Knio on January 3, 2022
Get help from others!
Recent Answers
Recent Questions
© 2024 TransWikia.com. All rights reserved. Sites we Love: PCI Database, UKBizDB, Menu Kuliner, Sharing RPP