-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdl_gss_spss.R
176 lines (130 loc) · 5.73 KB
/
dl_gss_spss.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
# analyze survey data for free (http://asdfree.com) with the r language
# general social survey
# 1972-2012 cross-sectional cumulative data (release 1, march 2013)
#
#
# if you have never used the r language before,
# watch this two minute video i made outlining
# how to run this script from start to finish
# http://www.screenr.com/Zpd8
# anthony joseph damico
# if you use this script for a project, please send me a note
# it's always nice to hear about how people are using this stuff
# for further reading on cross-package comparisons, see:
# http://journal.r-project.org/archive/2009-2/RJournal_2009-2_Damico.pdf
##################################################################################################################
# Analyze the 1972-2012 General Social Survey cross-sectional cumulative data (release 6, July 2014) file with R #
##################################################################################################################
# this script is memory intensive, so it starts by clearing the workspace.
# WARNING: the next line deletes every object in the global environment -
# save anything you want to keep before running it.
rm(list = ls())
# set your working directory.
# all GSS data files will be stored here
# after downloading and importing.
# use forward slashes instead of back slashes
# uncomment this line by removing the `#` at the front..
# setwd( "C:/My Directory/GSS/" )
# ..in order to set your current working directory
# set the number of digits shown in all output
options(digits = 8)
# install the packages this script needs, but only when they are missing,
# so that re-running the script does not re-download them every time
for (pkg in c("downloader", "foreign")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
# drop the loop variable so the workspace stays as empty as rm() left it
rm(pkg)
#install.packages( "survey" )
library(downloader)
library(foreign) # load foreign package (converts data files into R)
#library(survey) # load survey package (analyzes complex design surveys)
# by default, the survey package stops with an error if a stratum contains only a single primary sampling unit (psu)
# set R to produce conservative standard errors instead of stopping
# http://r-survey.r-forge.r-project.org/survey/exmample-lonely.html
# by uncommenting this line:
# options( survey.lonely.psu = "adjust" )
# this setting matches the MISSUNIT option in SUDAAN
###############################################
# DATA LOADING COMPONENT - ONLY RUN THIS ONCE #
###############################################
# create new character variables containing the full filepath of the file on norc's website
# that needs to be downloaded and imported into r for analysis
GSS.2012.CS.file.location <- "http://publicdata.norc.org/GSS/DOCUMENTS/OTHR/GSS_spss.zip"
# tf: path of a temporary file that will receive the download
tf <- tempfile()
# td: path of this R session's temporary directory
td <- tempdir()
# fetch the zipped file from the address stored above and write it to 'tf'.
# mode = "wb" writes the download as a binary file
download.file(
  url = GSS.2012.CS.file.location,
  destfile = tf,
  mode = "wb"
)
# the variable 'tf' now contains the full file path on the local computer to the specified file
# store the file path on the local disk to the extracted file (previously inside the zipped file)
# inside a new character string object 'fn'
# unzip the downloaded archive ('tf') into the temporary directory ('td'),
# overwriting anything already extracted there.
# 'fn' receives the full path of every extracted file
fn <- unzip(tf, exdir = td, overwrite = TRUE)
# pick out the spss file once: match the literal ".sav" extension in any
# letter case, so a name that merely ends in "sav" is not picked up by mistake
sav.file <- grep("\\.sav$", fn, value = TRUE, ignore.case = TRUE)
# read.spss is given a single file path below -
# stop early with a clear message if the archive does not provide exactly one
if (length(sav.file) != 1) {
  stop(
    "expected exactly one .sav file in the downloaded archive, found ",
    length(sav.file),
    call. = FALSE
  )
}
# print the temporary location of the spss (.sav) file to the screen
print(sav.file)
# the import below and the save that follows take a while,
# but once the data is saved as an .rda, future loading is fast
# convert the spss (.sav) file on the local disk into an r data frame.
# use.value.labels = FALSE asks read.spss not to convert labelled variables to factors
GSS.2012.CS.df <- read.spss(
  sav.file,
  to.data.frame = TRUE,
  use.value.labels = FALSE
)
# copy to a different object
# memory workaround: bind the data frame to a second name, drop the first name,
# collect garbage, then swap back and collect garbage again.
# NOTE(review): the original author reports that skipping this sequence makes a
# 3gb ram machine run out of memory, but why it helps is not established here
gss.tmp <- GSS.2012.CS.df
rm(GSS.2012.CS.df)
gc()
GSS.2012.CS.df <- gss.tmp
rm(gss.tmp)
gc()
# write the cross-sectional cumulative gss data frame to an r data file (.rda)
save(GSS.2012.CS.df, file = "GSS.2012.CS.rda")
# note that this .rda file will be stored in the local directory specified
# with the setwd command at the beginning of the script
##########################################################################
# END OF DATA LOADING COMPONENT - DO NOT RUN DATA LOADING COMMANDS AGAIN #
##########################################################################
# create working sets
# 1. GSS.2012.CS.rda    - complete cumulative file, created above
# 2. gss2008_12_all.rda - survey years after 2007 (2008-2012), all variables
# 3. gss2008_12.rda     - survey years after 2007, religion and demographic variables only

# the data loading component above is meant to be run only once, so on later
# runs the cumulative data frame has to come from the saved .rda file instead
if (!exists("GSS.2012.CS.df")) {
  load("GSS.2012.CS.rda")
}
# show which survey years are present
unique(GSS.2012.CS.df$year)
#unique(gss2010_12$year)
# keep only the records with a survey year after 2007 and save them with all variables
gss2008_12 <- subset(GSS.2012.CS.df, year > 2007)
save(gss2008_12, file = "gss2008_12_all.rda")
rm(GSS.2012.CS.df) # clean up memory, this is huge and we won't need it again
#gss2008_12<-GSS.2012.CS.df
names(gss2008_12)
# positions of every column whose name contains "reli"
relnames <- grep("reli", names(gss2008_12))
names(gss2008_12[relnames])
# list the columns whose name contains "premars" (inspection only)
names(gss2008_12[grep("premars", names(gss2008_12))])
# NOTE(review): these column positions are hard-coded, so they presumably only
# point at the intended variables for the release this script was written
# against - verify them against names(gss2008_12) when the data file changes
fixed.position.cols <- 4588:4675
if (ncol(gss2008_12) < max(fixed.position.cols)) {
  stop(
    "gss2008_12 has ", ncol(gss2008_12), " columns, but columns up to ",
    max(fixed.position.cols), " are selected by position",
    call. = FALSE
  )
}
names(gss2008_12[fixed.position.cols])
# assemble the reduced set from three column groups:
# A = named columns, B = the "reli" columns, C = the columns chosen by position
gss2008_12.religionA <- gss2008_12[c("year", "region", "age", "reborn", "attend", "attend12", "cohort")]
gss2008_12.religionB <- gss2008_12[relnames]
gss2008_12.religionC <- gss2008_12[fixed.position.cols]
gss2008_12.relcombine <- data.frame(gss2008_12.religionA, gss2008_12.religionB, gss2008_12.religionC)
# convert all names to uppercase
names(gss2008_12.relcombine) <- toupper(names(gss2008_12.relcombine))
head(gss2008_12.relcombine)
nrow(gss2008_12.relcombine)
rm(gss2008_12) # clean up memory before last save
save(gss2008_12.relcombine, file = "gss2008_12.rda")