I'm trying to plot a series of demographic factors. Each plot shows the frequency distribution of a demographic variable by gender. It runs nicely, but some of the labels are sorted alphabetically rather than in a meaningful order, e.g. Education, Marital Status and SIC2007.

Data structure

# Four-row sample of the survey data in dput()-style form.
# It is assigned to `demographics` because every recoding step below reads
# and writes that object; without the assignment the value was discarded.
demographics <- structure(list(DMSex = c("Male", "Female", "Male", "Male"), Income = c(980, 
-8, 3000, 120), IncCat = c("-1", "-8", "-1", "-1"), HrWkAc = c(-1, 
-1, -1, -1), ShiftWk = c(-1, -1, -1, -1), ShiftPat = c(-1, -1, 
-1, -1), SOC2010C = c("-1", "9.2.3.3", "-1", "-1"), XSOC2010 = c(-1, 
9233, -1, -1), IndexNo = c(-1, 1398, -1, -1), ES2010 = c(-1, 
7, -1, -1), nssec = c(-1, 13.4, -1, -1), SECFlag = c(-1, 0, -1, 
-1), LSOC2000 = c("-1", "9.2.3.3", "-1", "-1"), XSOC2000 = c(-1, 
9233, -1, -1), seg = c(-1, 11, -1, -1), sc = c(-1, 5, -1, -1), 
    SIC2007 = c(-1, 87, -1, -1), Educ = c(1, 1, -1, 2), EducCur = c(10, 
    1, -1, -1), FinFTEd = c(-1, -1, -1, 1), FinFTEdY = c(-1, 
    -1, -1, 21), HiQual = c(22, 10, -1, 1), sic20070 = c(-1, 
    87, -1, -1), dhhtype = c(6, 8, 7, 3), dagegrp = c(2, 3, 3, 
    3), dmarsta = c("Single, never married", "Single, never married", 
    "Interview not achieved", "Married/cohabitating"), dhiqual = c(" Secondary", 
    " A level or equivalent", "Item not applicable", "Degree or higher"
    ), dnssec8 = c(-1, 8, -1, -1), duresmc = c(14, 15, 11, 16
    ), dgorpaf = c(7, 8, 5, 10), dukcntr = c(1, 1, 1, 1), dnrkid04 = c(0, 
    0, 0, 0), dilodefr = c(3, 3, -1, 3), deconact = c(8, 8, -1, 
    11), dtenure = c(2, 3, 2, 3), dtotac = c(-1, -1, -1, -1), 
    dtotus = c(-1, -1, -1, -1), dsic = c("Item not applicable", 
    "Public admin, education and health", "Item not applicable", 
    "Item not applicable"), dsoc = c(-1, 9, -1, -1), DVAge_category = c("15 to 30", 
    "15 to 30", "15 to 30", "15 to 30"), Income_category = c("Less than 1000", 
    "Less than 1000", "1001 to 3000", "Less than 1000"), HoursWorked_category = c("Less than 20 hours", 
    "Less than 20 hours", "Less than 20 hours", "Less than 20 hours"
    )), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"
))

#Age variable

# Recode the age-group code `dagegrp` into labelled age bands.
#
# Fixes relative to the previous version:
#   * the first test read `demographics$dagegrp_01`, a column that is not in
#     the data; every other test uses `dagegrp`, so that is what is read now;
#   * the `age <- "..."` assignments inside ifelse() only created a stray
#     global and have been dropped.
#
# NOTE(review): the strict comparisons below send codes <= 2, 6, 9, 12, 15 and
# >= 18 to "zombie", which disagrees with the findInterval() version used for
# `DVAge_category` -- confirm which set of bounds is intended.
demographics$dagegrp_category <- ifelse(
  demographics$dagegrp > 2 & demographics$dagegrp < 6, "15 to 30",
  ifelse(
    demographics$dagegrp > 6 & demographics$dagegrp < 9, "31 to 45",
    ifelse(
      demographics$dagegrp > 9 & demographics$dagegrp < 12, "46 to 60",
      ifelse(
        demographics$dagegrp > 12 & demographics$dagegrp < 15, "61 to 75",
        ifelse(
          demographics$dagegrp > 15 & demographics$dagegrp < 18, "76+",
          "zombie"
        )
      )
    )
  )
)

# Band used for plotting: findInterval() returns the index of the half-open
# interval [break_i, break_i+1) each code falls in, which picks the label.
demographics$DVAge_category <- c(
  "15 to 30", "31 to 45", "46 to 60", "61 to 75", "76+", "zombie"
)[findInterval(demographics$dagegrp, c(-Inf, 6, 10, 12, 15, 18, Inf))]
Age <- as.vector(demographics$DVAge_category)

#Gender variable

# Swap the sex codes 1 and 2 for their labels; any value matching neither
# code (including values that are already labels) is left as it is.
demographics$DMSex <- replace(
  demographics$DMSex, which(demographics$DMSex == 1), "Male"
)
demographics$DMSex <- replace(
  demographics$DMSex, which(demographics$DMSex == 2), "Female"
)

# Plain vector copy used later when the plotting data frame is assembled.
Gender <- as.vector(demographics$DMSex)

#Income variable

# Bin `Income` into labelled bands.
#
# The earlier nested ifelse() wrote to the same `Income_category` column and
# was overwritten immediately by this assignment, so it has been removed. It
# also left 3001 and 6001 unclassified and created a stray global `income`.
#
# findInterval() returns the index of the half-open interval
# [break_i, break_i+1) each income falls in:
#   [-Inf, 1001)   "Less than 1000"
#   [1001, 3001)   "1001 to 3000"
#   [3001, 6001)   "3001 to 6000"
#   [6001, 10001)  "6001 to 10000"
#   [10001, Inf)   "zombie"
# NOTE(review): negative values such as the -8 in the sample data fall into
# "Less than 1000"; if they are missing-value codes they presumably need
# their own category -- confirm.
demographics$Income_category <- c(
  "Less than 1000", "1001 to 3000", "3001 to 6000", "6001 to 10000", "zombie"
)[findInterval(demographics$Income, c(-Inf, 1001, 3001, 6001, 10001, Inf))]

Income <- as.vector(demographics$Income_category)

#Marital status variable

# Translate the marital-status codes (-1, 1, 2, 3) into labels. The recoding
# is done on a local copy so no helper variable is left in the workspace;
# the replacements run in the same order as before.
demographics$dmarsta <- local({
  status <- demographics$dmarsta
  status[status == -1] <- "Interview not achieved"
  status[status == 1] <- "Single, never married"
  status[status == 2] <- "Married/cohabitating"
  status[status == 3] <- "Divorced/widowed"
  status
})

# Plain vector copy used later when the plotting data frame is assembled.
MaritalStatus <- as.vector(demographics$dmarsta)

#Education

# Translate the highest-qualification codes (-8, -1, 1..5) into labels.
# The leading space in some labels is part of the label text and is kept.
demographics[["dhiqual"]][demographics[["dhiqual"]] %in% -8] <- "Don't know"
demographics[["dhiqual"]][demographics[["dhiqual"]] %in% -1] <- "Item not applicable"
demographics[["dhiqual"]][demographics[["dhiqual"]] %in% 1] <- "Degree or higher"
demographics[["dhiqual"]][demographics[["dhiqual"]] %in% 2] <- "Higher education"
demographics[["dhiqual"]][demographics[["dhiqual"]] %in% 3] <- " A level or equivalent"
demographics[["dhiqual"]][demographics[["dhiqual"]] %in% 4] <- " Secondary"
demographics[["dhiqual"]][demographics[["dhiqual"]] %in% 5] <- " Other"

# Plain vector copy used later when the plotting data frame is assembled.
Education <- as.vector(demographics[["dhiqual"]])


#Hours worked per week in main job variable

# Bin weekly hours `dtotac` into labelled bands.
#
# Fixes relative to the previous version:
#   * the nested ifelse() wrote to the same column and was overwritten
#     immediately, so it has been removed (it also created a stray global
#     `workhours`);
#   * the old breaks c(-Inf, 21, 41, 61, 62, Inf) labelled only [61, 62) as
#     "More than 61 hours" and everything from 62 upwards as "Not Applicable";
#     61 and above is now "More than 61 hours";
#   * negative values used to land in "Less than 20 hours"; they are now
#     "Not Applicable". This assumes negative `dtotac` values are
#     missing-value codes, as -1 / -8 are for `dhiqual` and `dsic` -- confirm.
#
# findInterval() returns the index of the half-open interval
# [break_i, break_i+1) each value falls in:
#   [-Inf, 0)  "Not Applicable"
#   [0, 21)    "Less than 20 hours"
#   [21, 41)   "Between 21 to 40 hours"
#   [41, 61)   "Between 41 to 60 hours"
#   [61, Inf]  "More than 61 hours"
demographics$HoursWorked_category <- c(
  "Not Applicable", "Less than 20 hours", "Between 21 to 40 hours",
  "Between 41 to 60 hours", "More than 61 hours"
)[findInterval(demographics$dtotac, c(-Inf, 0, 21, 41, 61))]

WorkHours <- as.vector(demographics$HoursWorked_category)

#DV: SIC 2007 industry divisions (grouped)

# Translate the grouped SIC 2007 industry codes (-8, -1, 1..9) into labels.
demographics$dsic[demographics$dsic == -8] <- "Don't know"
demographics$dsic[demographics$dsic == -1] <- "Item not applicable"
demographics$dsic[demographics$dsic == 1] <- "Agriculture, forestry and fishing"
demographics$dsic[demographics$dsic == 2] <- "Manufacturing"
demographics$dsic[demographics$dsic == 3] <- "Energy and water supply"
demographics$dsic[demographics$dsic == 4] <- "Construction"
demographics$dsic[demographics$dsic == 5] <- "Distribution, hotels and restaurants"
demographics$dsic[demographics$dsic == 6] <- "Transport and communication"
demographics$dsic[demographics$dsic == 7] <- "Banking and finances"
demographics$dsic[demographics$dsic == 8] <- "Public admin, education and health"
demographics$dsic[demographics$dsic == 9] <- "Other services"

# Same labels as a factor whose levels follow the code order.
# Previously the bare 11-element label vector was assigned as a column, which
# does not match the number of rows, and its last entry read "Other service",
# which does not match the recoded label "Other services" above.
demographics$industry_category <- factor(
  demographics$dsic,
  levels = c(
    "Don't know", "Item not applicable", "Agriculture, forestry and fishing",
    "Manufacturing", "Energy and water supply", "Construction",
    "Distribution, hotels and restaurants", "Transport and communication",
    "Banking and finances", "Public admin, education and health",
    "Other services"
  )
)

# Plain character vector used later when the plotting data frame is assembled.
SIC2007 <- as.vector(demographics$dsic)


# creating df

# Assemble the recoded vectors into one data frame, one column per
# characteristic.
df <- data.frame(
  Gender = Gender,
  Age = Age,
  Education = Education,
  MaritalStatus = MaritalStatus,
  Income = Income,
  WorkHours = WorkHours,
  SIC2007 = SIC2007
)

# Reshape to long format (every column except Gender), count the rows in each
# value / variable / Gender cell, and draw dodged bars with one facet per
# variable.
df %>%
  gather(variable, value, -Gender) %>%
  group_by(value, variable, Gender) %>%
  summarise(freq = n()) %>%
  ggplot(aes(x = value, y = freq, group = Gender)) +
  geom_bar(aes(fill = Gender), stat = "identity", position = "dodge") +
  facet_wrap(~variable, scales = "free_x") +
  theme(
    legend.position = "right",
    axis.text.x = element_text(angle = 60, hjust = 1)
  ) +
  labs(x = "Characteristics", y = "Frequencies")

Output

1 Answer

1
Jurr.Ian On Best Solutions

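In ggplot2, a discrete axis is ordered according to the factor levels of the plotted data.frame column; a character column is treated like a factor with alphabetically sorted levels, which is why your labels come out in alphabetical order. To (re)set the order in your plot, just set the order of the factor levels: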

# Template, not runnable as written: replace c(...) with the labels in the
# order they should appear. `variable` is presumably a stand-in for whichever
# column needs reordering.
df$variable <- factor(df$variable, levels = c(...))

You could do this by storing the summarised data.frame first, instead of piping it straight into ggplot(), and then manually setting the levels of the column whose labels you want to reorder. It may be a bit inefficient, but this should do the trick:

## Make your plotting data.frame
## (ungroup() drops the grouping that summarise() leaves behind, so the column
## can be modified like an ordinary one)
df2 <- df %>%
  gather(variable, value, -c(Gender)) %>%
  group_by(value, variable, Gender) %>%
  summarise(freq = n()) %>%
  ungroup()

## Apply custom order to the MaritalStatus labels.
## The labels live in the `value` column (`variable` only holds the names
## "Age", "Education", ...), so that is the column turned into a factor.
## They are spelled out rather than picked by position from
## sort(unique(MaritalStatus)), which breaks when a category is absent.
## Append the labels of any other variable you want to order to `custom`.
custom <- c(
  "Single, never married", "Married/cohabitating",
  "Divorced/widowed", "Interview not achieved"
)

## Labels without a custom position keep their current order and come first;
## `value` is a character column after gather(), so unique() is used where
## levels() would return NULL.
df2$value <- factor(
  df2$value,
  levels = c(setdiff(unique(df2$value), custom), custom)
)