This is an R Notebook which reproduces the analysis, tables and figures of the working paper "MetaMetaZipf. What do analyses of city size distributions have in common?", as of August 2020. The data is stored on the MetaZipf GitHub repository.
# Input tables: reference list, citation matrix, meta-analysis records and the
# journal -> discipline lookup, all read from the local data/ folder.
refs  <- read.csv2("data/zipf_refs.csv", sep = ";")
cites <- read.csv2("data/zipf_cites.csv", sep = ",")
data  <- read.csv2("data/zipf_meta.csv", sep = ",", dec = ".")
J2D   <- read.csv2("data/journals2Disciplines.csv", sep = ";")

# Full-text corpus: one document per file found in the directory below.
cname <- "data/FullText/"
docs  <- Corpus(DirSource(cname))
# Rows 67:1221 of `cites` are treated as the externally cited references; their
# YEAR, JOURNAL and AUTHOR fields are derived from the REFID string.
cites_out <- cites[67:1221, ]
# YEAR: keep only the digits of REFID.
cites_out$YEAR <- as.numeric(gsub("\\D+", "", cites_out$REFID))
# JOURNAL: drop everything up to the last digit plus the following character.
cites_out$JOURNAL <- gsub(".*\\d.", "", cites_out$REFID)
# AUTHOR: drop everything from the first digit onwards.
cites_out$AUTHOR <- gsub("\\d.*", "", cites_out$REFID)
# Replace punctuation (e.g. the underscores of REFID) by blanks.
cites_out$JOURNAL <- gsub("[[:punct:]]", " ", cites_out$JOURNAL)
cites_out$AUTHOR <- gsub("[[:punct:]]", " ", cites_out$AUTHOR)

# Attach the lookup row of each journal; the lookup spells the column
# "DISCPLINE", so it is copied under the correct name and the duplicates dropped.
cites_out <- data.frame(cites_out, J2D[match(cites_out$JOURNAL, J2D$JOURNAL), ])
cites_out$DISCIPLINE <- cites_out$DISCPLINE
cites_out$DISCPLINE <- NULL
cites_out$JOURNAL.1 <- NULL

# Put the first 66 rows and the enriched external references back together.
cites <- rbind(cites[1:66, ], cites_out)
# Only missing cells become 0. The line read `cites[] <- 0` in this copy, which
# would overwrite every cell of the table and empty all downstream results.
cites[is.na(cites)] <- 0
# Citation indicators among the first 66 rows (REFID plus columns 6:71).
cites_within <- cites[1:66, c(1, 6:71)]

# Descriptive columns of the same 66 rows, with a unit weight for counting.
j <- cites[1:66, 1:5]
j$n <- 1

# Number of rows per journal and per discipline (columns Group.1 and x).
journals <- aggregate(j[, "n"], by = list(j[, "JOURNAL"]), FUN = sum)
disciplines <- aggregate(j[, "n"], by = list(j[, "DISCIPLINE"]), FUN = sum)

# Lollipop chart of journals, ordered by decreasing count.
q <- ggplot(journals, aes(x = Group.1, y = x))
q +
  geom_lollipop(aes(reorder(Group.1, -x)), color = "orangered", cex = 1) +
  coord_flip() +
  labs(x = "Journal", y = "Number of articles in the corpus")
# Turn the within-corpus table into a matrix keyed by REFID and transpose it.
rownames(cites_within) <- cites_within$REFID
cites_within$REFID <- NULL
cmat <- t(as.matrix(cites_within))

# Column totals of the transposed matrix, sorted in decreasing order.
c <- colSums(cmat)
co <- c[order(-c)]
ref <- names(co)
cope <- data.frame(ref, co)

# Add author / year / journal and build a readable "AUTHOR YEAR" label.
cope_full_names <- data.frame(
  cope,
  cites[match(cope$ref, cites$REFID), c("AUTHOR", "YEAR", "JOURNAL")]
)
cope_full_names$ref <- paste(
  cope_full_names$AUTHOR, cope_full_names$YEAR,
  sep = " "
)

par(mar = c(2, 2, 2, 2))
q <- ggplot(cope_full_names, aes(x = ref, y = co))
q +
  geom_lollipop(aes(reorder(ref, -co)), color = "seagreen3", cex = 1) +
  coord_flip() +
  labs(x = "Reference", y = "Number of in-citations from the corpus")
# Mean publication year per journal / discipline, side by side with the counts
# computed earlier (both aggregates are grouped on the same key).
age_journals <- data.frame(
  aggregate(j[, "YEAR"], by = list(j[, "JOURNAL"]), FUN = mean),
  journals$x
)
age_disciplines <- data.frame(
  aggregate(j[, "YEAR"], by = list(j[, "DISCIPLINE"]), FUN = mean),
  disciplines$x
)

# Use the group label as row name, then drop it as a column.
rownames(age_disciplines) <- age_disciplines$Group.1
rownames(age_journals) <- age_journals$Group.1
age_disciplines$Group.1 <- NULL
age_journals$Group.1 <- NULL
colnames(age_disciplines) <- c("mean age", "n articles")
colnames(age_journals) <- c("mean age", "n articles")

# Overall mean publication year (argument spelled out instead of relying on
# partial matching of `digit`).
round(mean(j$YEAR), digits = 0)
[1] 2004
# Rounded discipline table (full argument name instead of partial `digit`).
round(age_disciplines, digits = 0)

# Join the in-citation counts onto the descriptive table; the 8th column is the
# count column coming from `cope`.
corpussummary <- data.frame(j, cope[match(j$REFID, cope$ref), ])
colnames(corpussummary)[8] <- "in_citations"

# Mean publication year of the rows with more than 10 in-citations.
mean(corpussummary[corpussummary$in_citations > 10, "YEAR"])
[1] 1998.3
# External references again, now with the number of citing columns per row.
cites_out <- cites[67:1221, ]
cites_out$n_cites <- rowSums(cites_out[, 6:71])

# Descriptive columns plus the count (column 72), most cited first.
single_outcites <- cites_out[order(-cites_out$n_cites), c(1:5, 72)]

# Two-column table (ref, n); row 100 is dropped and only n >= 5 is kept.
s_outcites <- setNames(
  single_outcites[, c("REFID", "n_cites")],
  c("ref", "n")
)
s_outcites <- s_outcites[-100, ]
s_outcites <- s_outcites[!is.na(s_outcites$n) & s_outcites$n >= 5, ]

q <- ggplot(s_outcites, aes(x = ref, y = n))
q +
  geom_lollipop(aes(reorder(ref, -n)), color = "coral3", cex = 1) +
  coord_flip() +
  labs(x = "Reference", y = "Number of citations from the corpus (>=5)")
Who cited Nitsch (2005)?
# Citation indicators (columns 6:71) of the Nitsch (2005) row.
who <- cites_out[
  cites_out$REFID == "Nitsch_2005_Journal_Urban_Economics",
  6:71
]
Who did not cite G. K. Zipf himself?
# NOTE(review): the next statement is truncated in this copy — the call that
# wraps the subset and the rest of the REFID list after "Zipf_1941_Unity_disunity"
# are missing, so it does not parse as shown. Restore it from the original
# notebook before running; presumably it produces one value per corpus column.
nozipf<-[cites_out$REFID %in% c("Zipf_1941_Unity_disunity",
colnames(nozipf) <- "ref_zipf"
# Recode the value into a two-level label: 0 -> "no ref to zipf", else "ref to zipf".
nozipf$ref_zipf <- ifelse(nozipf$ref_zipf == 0 , "no ref to zipf", "ref to zipf")
nozipf$REFID <- rownames(nozipf)
# First 66 rows of `cites`, joined with the label by REFID.
cites_for_ref_zipfs <- cites[1:66,]
referencing_zipf <- data.frame(cites_for_ref_zipfs, nozipf[match(cites_for_ref_zipfs$REFID, nozipf$REFID),])
# Keep only the three columns used by the plot and the cross-table below.
rfczpf <- referencing_zipf[,c("YEAR", "DISCIPLINE", "ref_zipf")]
# Yearly density histogram with an overlaid density curve, split by whether
# the row is labelled as referencing Zipf.
q <- ggplot(rfczpf, aes(x = YEAR))
q +
  geom_histogram(
    aes(y = stat(density), color = ref_zipf, fill = ref_zipf),
    alpha = 0.4,
    position = "identity",
    binwidth = 1
  ) +
  geom_density(aes(color = ref_zipf), size = 1) +
  scale_color_manual(values = c("Coral2", "dodgerblue3")) +
  scale_fill_manual(values = c("Coral2", "dodgerblue3")) +
  labs(color = "") +
  guides(fill = FALSE, size = FALSE) +
  theme(legend.position = "top")

# Cross-tabulation of the label by discipline.
table(rfczpf$ref_zipf, rfczpf$DISCIPLINE)
no ref to zipf 0 11 5 0 10 0 0
ref to zipf 0 12 8 4 16 0 0
Which are the authors cited from Urban Studies?
# AUTHOR values of the external references whose JOURNAL is "Urban Studies".
with(cites_out, AUTHOR[JOURNAL == "Urban Studies"])
[1] Alperovich Batten Begg Bergsman Greenston Healy
[5] Boddy Chen Fu Zhang Cheshire Carbonaro Clark Stabler
[9] Ettlinger Garmestani Allen Gallagher al Garmestani Allen Gallagher al Hsu
[13] Huzinec Kuiper Kuiper Paelink Lanaspa Pueyo Sans Lipshitz
[17] Marshall Ogawa Parr Parr Suzuki
[21] Resende SuarezVilla Thompson VonBoventer
[25] Xu Zhu Zhao Zhang <NA>
987 Levels: Alperovich G. Amalraj V. C. & Subbarayan A. Anderson G. & Ge Y. Aragon J. A. O. & Queiroz V. Arribas-Bel D. et al. ... Zipf
Size of bibliography of corpus articles
# Bibliography size per corpus column: column sums of the citation indicators
# over the external references. The start of this statement was lost in this
# copy and is restored here.
n_citations <- as.data.frame(colSums(cites_out[, 6:71], na.rm = TRUE))
colnames(n_citations) <- "n_citations"
n_citations$REFID <- rownames(n_citations)

# Join onto the corpus rows and build an "AUTHOR YEAR" label.
n_citations_corpus <- data.frame(
  cites_for_ref_zipfs,
  n_citations[match(cites_for_ref_zipfs$REFID, n_citations$REFID), ]
)
n_citations_corpus$ref <- paste(
  n_citations_corpus$AUTHOR, n_citations_corpus$YEAR,
  sep = " "
)

q <- ggplot(n_citations_corpus, aes(x = ref, y = n_citations))
q +
  geom_lollipop(aes(reorder(ref, -n_citations)), color = "dodgerblue3", cex = 1) +
  coord_flip() +
  labs(x = "Reference", y = "Size of bibliography by corpus article")
Most cited external articles
# Frequency table of the journals of the external references. The start of the
# statement was lost in this copy; as.data.frame(table(.)) yields the `Freq`
# column that the next line sorts on.
jou_out <- as.data.frame(table(cites_out$JOURNAL))
jou_out <- jou_out[order(-jou_out$Freq), ]
colnames(jou_out) <- c("ref", "n")

# Row 100 is dropped and only journals with n >= 5 are plotted.
jou <- subset(jou_out[-100, ], n >= 5)

q <- ggplot(jou, aes(x = ref, y = n))
q +
  geom_lollipop(aes(reorder(ref, -n)), color = "goldenrod3", cex = 1) +
  coord_flip() +
  labs(x = "Journal", y = "Number of citations from the corpus (>=5)")
# Pairwise cosine similarity between the citation columns (one row of the
# transposed matrix per column of cites_out[, 6:71]).
outcitemat <- as.matrix(cites_out[, 6:71])
toutcitemat <- t(outcitemat)
refSim <- rownames(toutcitemat)

cosSim <- data.frame()
# Running row index of the pair table; initialised here so the loop does not
# depend on a value left over from earlier code.
k <- 0
ilist <- c()
for (i in refSim) {
  # Once i is in ilist it is skipped as a partner, so each unordered pair is
  # visited once and self-pairs never occur.
  ilist <- c(ilist, i)
  for (j in refSim) {
    if (j %!in% ilist) {
      k <- k + 1
      cosSim[k, 1] <- i
      cosSim[k, 2] <- j
      vi <- toutcitemat[i, ]
      vj <- toutcitemat[j, ]
      cosSim[k, 3] <- (sumNum(vi * vj)) / (norm_vec(vi) * norm_vec(vj))
    }
  }
}
colnames(cosSim) <- c('i', 'j', 'cosSim')
write.csv(cosSim[order(-cosSim$cosSim), ], "simNets/SimilarCitations.csv")
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00000 0.00000 0.04550 0.06399 0.10020 0.48670
# Network of pairs with citation cosine similarity >= 0.25.
cs.cit <- cosSim[cosSim$cosSim >= 0.25, ]
g.cit <- graph_from_data_frame(cs.cit, directed = FALSE)

# norm_vec() of each vertex's row, used as vertex size. Several tokens of the
# next statements were lost in this copy and are restored here.
citingNs <- as.data.frame(
  apply(toutcitemat[rownames(toutcitemat) %in% V(g.cit)$name, ], 1, FUN = norm_vec)
)
colnames(citingNs) <- "citingN"
citingNs$ref <- rownames(citingNs)

# Re-order the sizes to follow the vertex order of the graph;
# data.frame() names the single column "V.g.cit..name".
orderedName <- data.frame(V(g.cit)$name)
citingN <- data.frame(
  orderedName,
  citingNs[match(orderedName$V.g.cit..name, citingNs$ref), ]
)[, "citingN"]
citingN <- citingN[!is.na(citingN)]

par(mar = c(0, 0, 0, 0))
clln.cit <- cluster_louvain(g.cit)
layout <- layout_nicely(g.cit, 2)
g.cit$layout <- layout
plot(g.cit, edge.width = sqrt(cs.cit$cosSim) * 2, vertex.size = citingN,
     vertex.label.cex = 0.7, edge.curved = .2, vertex.color = membership(clln.cit))
# Citations per discipline of the cited reference (rows) for each of the
# columns 6:71 of cites_out.
discipline2ref <- aggregate(cites_out[, c(6:71)], by = list(cites_out$DISCIPLINE), FUN = sumNum)
rownames(discipline2ref) <- discipline2ref$Group.1
discipline2ref$Group.1 <- NULL

# For every original column add a "freq_<column>" column holding its share of
# the column total.
cols <- colnames(discipline2ref)
for (i in cols) {
  absolute <- discipline2ref[, i]
  total <- sum(absolute)
  relative <- absolute / total
  discipline2ref[, paste0("freq_", i)] <- relative
}

# Keep the share columns only (67:132) and give them back the original names.
disc2ref <- as.matrix(discipline2ref[, 67:132])
colnames(disc2ref) <- cols

# Column-wise rescaling: sweep() aligns each column with its own total, whereas
# `disc2ref / colSums(disc2ref)` recycles the totals down the rows.
scaled_disc2ref <- sweep(disc2ref, 2, colSums(disc2ref), "/")
# Pairwise cosine similarity between the discipline-share profiles.
tdisc2ref <- t(disc2ref)
refSim <- rownames(tdisc2ref)

cosSim <- data.frame()
# Running row index of the pair table, reset for this loop.
k <- 0
ilist <- c()
for (i in refSim) {
  # Skip partners already used as i: each unordered pair appears once.
  ilist <- c(ilist, i)
  for (j in refSim) {
    if (j %!in% ilist) {
      k <- k + 1
      cosSim[k, 1] <- i
      cosSim[k, 2] <- j
      vi <- tdisc2ref[i, ]
      vj <- tdisc2ref[j, ]
      cosSim[k, 3] <- (sumNum(vi * vj)) / (norm_vec(vi) * norm_vec(vj))
    }
  }
}
colnames(cosSim) <- c('i', 'j', 'cosSim')
write.csv(cosSim[order(-cosSim$cosSim), ], "simNets/SimilarDisciplinesCited_rel.csv")
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.07909 0.61930 0.79130 0.73710 0.88950 0.99690
# Network of pairs whose discipline-share profiles have cosine similarity >= 0.9.
cs.cit <- cosSim[cosSim$cosSim >= 0.9, ]
g.cit <- graph_from_data_frame(cs.cit, directed = FALSE)

# norm_vec() of each vertex's profile, used as vertex size. Several tokens of
# the next statements were lost in this copy and are restored here.
disCitedNs <- as.data.frame(
  apply(tdisc2ref[rownames(tdisc2ref) %in% V(g.cit)$name, ], 1, FUN = norm_vec)
)
colnames(disCitedNs) <- "disCitedNs"
disCitedNs$ref <- rownames(disCitedNs)

# Align the sizes with the vertex order of the graph;
# data.frame() names the single column "V.g.cit..name".
orderedName <- data.frame(V(g.cit)$name)
disCitedNs <- data.frame(
  orderedName,
  disCitedNs[match(orderedName$V.g.cit..name, disCitedNs$ref), ]
)[, "disCitedNs"]
disCitedNs <- disCitedNs[!is.na(disCitedNs)]

# Discipline of each vertex, used as vertex colour.
colnames(orderedName) <- "REFID"
refinfo <- cites[1:66, c("REFID", "DISCIPLINE")]
own_discipline <- data.frame(orderedName, refinfo[match(orderedName$REFID, refinfo$REFID), ])
odisc <- own_discipline$DISCIPLINE

layout <- layout_nicely(g.cit, 2)
g.cit$layout <- layout
par(mar = c(0, 0, 0, 0))
plot(g.cit, edge.width = sqrt(cs.cit$cosSim) * 2, vertex.size = disCitedNs,
     vertex.label.cex = 0.7, edge.curved = .2, vertex.color = odisc)
# Text cleaning of the full-text corpus.
# 1. Replace a set of separator characters by blanks (toSpace is defined
#    elsewhere in the notebook).
for (sep_char in c("-", ":", "'", "`", "‘", "_", "–", "/")) {
  docs <- tm_map(docs, toSpace, sep_char)
}
# 2. Standard transformations. tolower is a plain character function, so it is
#    wrapped in content_transformer() to keep the documents as corpus objects.
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removeWords, stopwords("english"))

# Document-term and term-document matrices, also as plain matrices.
dtm <- DocumentTermMatrix(docs)
dtm.m <- as.matrix(dtm)
tdm <- TermDocumentMatrix(docs)
tdm.m <- as.matrix(tdm)
# Term-document counts as a data frame (terms in rows, documents in columns).
# The right-hand side was lost in this copy and is restored here.
tdm.df <- as.data.frame(tdm.m)

# Relative term frequencies per document. sweep() divides each column by its
# own total; `tdm.df / colSums(tdm.df)` recycles the totals down the rows and
# so divides most cells by the total of a different document.
tdm.df.f <- as.data.frame(sweep(as.matrix(tdm.df), 2, colSums(tdm.df), "/"))

# Total number of words per document.
# NOTE(review): the argument of colSums() was lost in this copy; tdm.df is
# assumed (tdm.m would give the same totals) — confirm against the original.
n_words <- as.data.frame(colSums(tdm.df, na.rm = TRUE))
colnames(n_words) <- "n_words"
# The first 8 characters of the document name are matched against REFID below.
n_words$REFID <- substr(rownames(n_words), 1, 8)

n_words_corpus <- data.frame(
  cites_for_ref_zipfs,
  n_words[match(cites_for_ref_zipfs$REFID, n_words$REFID), ]
)
n_words_corpus$ref <- paste(n_words_corpus$AUTHOR, n_words_corpus$YEAR, sep = " ")
# Rows authored by "Krakover S." are left out of the chart.
n_words_corpus <- n_words_corpus[n_words_corpus$AUTHOR != "Krakover S.", ]

q <- ggplot(n_words_corpus, aes(x = ref, y = n_words))
q +
  geom_lollipop(aes(reorder(ref, -n_words)), color = "seagreen4", cex = 1) +
  coord_flip() +
  labs(x = "Reference", y = "Number of words by corpus article")
# Pairwise cosine similarity between the term-frequency profiles of documents.
termmat <- t(as.matrix(tdm.df.f))
refSimTerm <- rownames(termmat)

cosSimTerm <- data.frame()
# Running row index of the pair table, reset for this loop.
k <- 0
ilist <- c()
for (i in refSimTerm) {
  # Skip partners already used as i: each unordered pair appears once.
  ilist <- c(ilist, i)
  for (j in refSimTerm) {
    if (j %!in% ilist) {
      k <- k + 1
      cosSimTerm[k, 1] <- i
      cosSimTerm[k, 2] <- j
      vi <- termmat[i, ]
      vj <- termmat[j, ]
      cosSimTerm[k, 3] <- (sumNum(vi * vj)) / (norm_vec(vi) * norm_vec(vj))
    }
  }
}
colnames(cosSimTerm) <- c('i', 'j', 'cosSimTerm')
write.csv(cosSimTerm[order(-cosSimTerm$cosSimTerm), ], "simNets/SimilarWording.csv")
i j cosSimTerm
Length:2145 Length:2145 Min. :0.1229
Class :character Class :character 1st Qu.:0.3685
Mode :character Mode :character Median :0.4741
Mean :0.4747
3rd Qu.:0.5777
Max. :0.8728
# Network of document pairs with wording cosine similarity > 0.7.
cs <- cosSimTerm[cosSimTerm$cosSimTerm > 0.7, ]
g.term <- graph_from_data_frame(cs, directed = FALSE)

# norm_vec() of each vertex's term profile, used as vertex size. Several tokens
# of the next statements were lost in this copy and are restored here.
totalTerm <- as.data.frame(
  apply(termmat[rownames(termmat) %in% V(g.term)$name, ], 1, FUN = norm_vec)
)
colnames(totalTerm) <- "totalTerms"
totalTerm$ref <- rownames(totalTerm)

# Align with the vertex order; data.frame() names the column "V.g.term..name".
orderedName <- data.frame(V(g.term)$name)
totalTerms <- data.frame(
  orderedName,
  totalTerm[match(orderedName$V.g.term..name, totalTerm$ref), ]
)[, "totalTerms"]
totalTerms <- totalTerms[!is.na(totalTerms)]

clln.term <- cluster_louvain(g.term)
layout <- layout_nicely(g.term, 2)
g.term$layout <- layout
par(mar = c(0, 0, 0, 0))
plot(g.term, edge.width = sqrt(cs$cosSimTerm) * 2, vertex.size = totalTerms * 50,
     vertex.label.cex = 0.7,
     edge.curved = .2, vertex.color = membership(clln.term))
# Similarity of papers by the set of countries they study.
paperList <- colnames(cites_out[, 6:71])
countryData <- data[data$TERRITORY_TYPE == "Country" & data$REFID %in% paperList, ]

# Country x paper presence matrix (first table row dropped, empty rows removed,
# counts turned into 0/1).
country2Ref <- table(countryData$CNTR_ID, countryData$REFID)[-1, ]
country2Ref.mat <- as.matrix(country2Ref[rowSums(country2Ref) > 0, colnames(country2Ref) %in% paperList])
country2Ref.mat[country2Ref.mat > 0] <- 1
countrymat <- t(country2Ref.mat)

refSim <- rownames(countrymat)
cosSimC <- data.frame()
# Running row index of the pair table, reset for this loop.
k <- 0
ilist <- c()
for (i in refSim) {
  # Skip partners already used as i: each unordered pair appears once.
  ilist <- c(ilist, i)
  for (j in refSim) {
    if (j %!in% ilist) {
      k <- k + 1
      cosSimC[k, 1] <- i
      cosSimC[k, 2] <- j
      vi <- countrymat[i, ]
      vj <- countrymat[j, ]
      s <- (sumNum(vi * vj)) / (norm_vec(vi) * norm_vec(vj))
      # An undefined similarity (missing / NaN) is stored as 0. The is.na()
      # test was lost in this copy and is restored here.
      cosSimC[k, 3] <- if (is.na(s)) 0 else s
    }
  }
}
colnames(cosSimC) <- c('i', 'j', 'cosSimC')
write.csv(cosSimC[order(-cosSimC$cosSimC), ], "simNets/SimilarCountries.csv")
i j cosSimC
Length:2080 Length:2080 Min. :0.00000
Class :character Class :character 1st Qu.:0.00000
Mode :character Mode :character Median :0.00000
Mean :0.09676
3rd Qu.:0.00000
Max. :1.00000
# Network of paper pairs with country cosine similarity >= 0.2.
cs.cntr <- cosSimC[cosSimC$cosSimC >= 0.2, ]
g.cntr <- graph_from_data_frame(cs.cntr, directed = FALSE)

# norm_vec() of each vertex's country profile, used as vertex size. Several
# tokens of the next statements were lost in this copy and are restored here.
countryN <- as.data.frame(
  apply(countrymat[rownames(countrymat) %in% V(g.cntr)$name, ], 1, FUN = norm_vec)
)
colnames(countryN) <- "n_countries"
countryN$ref <- rownames(countryN)

# Align with the vertex order; data.frame() names the column "V.g.cntr..name".
orderedName <- data.frame(V(g.cntr)$name)
orderedCountryN <- data.frame(
  orderedName,
  countryN[match(orderedName$V.g.cntr..name, countryN$ref), ]
)[, "n_countries"]
orderedCountryN <- orderedCountryN[!is.na(orderedCountryN)]

clln.cntr <- cluster_louvain(g.cntr)
layout <- layout_nicely(g.cntr, 2)
g.cntr$layout <- layout
par(mar = c(0, 0, 0, 0))
plot(g.cntr, edge.width = sqrt(cs.cntr$cosSimC) * 2, vertex.size = orderedCountryN * 2,
     vertex.label.cex = 0.7, edge.curved = .2, vertex.color = membership(clln.cntr))
# Similarity of papers by the city definitions (URBANSCALE) they use.
# The is.na() test on URBANSCALE was lost in this copy and is restored here.
cityData <- data[!is.na(data$URBANSCALE) & data$REFID %in% paperList, ]

# Definition x paper presence matrix (empty rows removed, counts turned to 0/1).
city2Ref <- table(cityData$URBANSCALE, cityData$REFID)[, ]
city2Ref.mat <- as.matrix(city2Ref[rowSums(city2Ref) > 0, colnames(city2Ref) %in% paperList])
city2Ref.mat[city2Ref.mat > 0] <- 1
citymat <- t(city2Ref.mat)

refSim <- rownames(citymat)
cosSimCt <- data.frame()
# Running row index of the pair table, reset for this loop.
k <- 0
ilist <- c()
for (i in refSim) {
  # Skip partners already used as i: each unordered pair appears once.
  ilist <- c(ilist, i)
  for (j in refSim) {
    if (j %!in% ilist) {
      k <- k + 1
      cosSimCt[k, 1] <- i
      cosSimCt[k, 2] <- j
      vi <- citymat[i, ]
      vj <- citymat[j, ]
      s <- (sumNum(vi * vj)) / (norm_vec(vi) * norm_vec(vj))
      # An undefined similarity (missing / NaN) is stored as 0.
      cosSimCt[k, 3] <- if (is.na(s)) 0 else s
    }
  }
}
colnames(cosSimCt) <- c('i', 'j', 'cosSimCt')
write.csv(cosSimCt[order(-cosSimCt$cosSimCt), ], "simNets/SimilarCities.csv")
i j cosSimCt
Length:2080 Length:2080 Min. :0.0000
Class :character Class :character 1st Qu.:0.0000
Mode :character Mode :character Median :0.0000
Mean :0.4247
3rd Qu.:1.0000
Max. :1.0000
# Network of paper pairs with city-definition cosine similarity >= 0.1.
# The object names (cs.city, g.city, clln.city) and several other tokens were
# stripped from this chunk in this copy; they are restored by analogy with the
# country and decade chunks.
cs.city <- cosSimCt[cosSimCt$cosSimCt >= 0.1, ]
g.city <- graph_from_data_frame(cs.city, directed = FALSE)

# norm_vec() of each vertex's definition profile, used as vertex size.
cityN <- as.data.frame(
  apply(citymat[rownames(citymat) %in% V(g.city)$name, ], 1, FUN = norm_vec)
)
colnames(cityN) <- "n_cities"
cityN$ref <- rownames(cityN)

# Align with the vertex order; data.frame() names the column "V.g.city..name".
orderedName <- data.frame(V(g.city)$name)
orderedcityN <- data.frame(
  orderedName,
  cityN[match(orderedName$V.g.city..name, cityN$ref), ]
)[, "n_cities"]
orderedcityN <- orderedcityN[!is.na(orderedcityN)]

clln.city <- cluster_louvain(g.city)
layout <- layout_nicely(g.city, 2)
g.city$layout <- layout
par(mar = c(0, 0, 0, 0))
plot(g.city, edge.width = sqrt(cs.city$cosSimCt) * 2, vertex.size = orderedcityN * 2,
     vertex.label.cex = 0.7, edge.curved = .2, vertex.color = membership(clln.city))
# Similarity of papers by the decades they study.
# The is.na() test on DATE was lost in this copy and is restored here.
decadeData <- data[!is.na(data$DATE) & data$REFID %in% paperList, ]
# Decade label: first three characters of DATE followed by "0s".
decadeData$DECADE <- paste0(substr(as.character(decadeData$DATE), 1, 3), 0, "s")

# Decade x paper presence matrix (empty rows removed, counts turned to 0/1).
decade2Ref <- table(decadeData$DECADE, decadeData$REFID)[, ]
decade2Ref.mat <- as.matrix(decade2Ref[rowSums(decade2Ref) > 0, colnames(decade2Ref) %in% paperList])
decade2Ref.mat[decade2Ref.mat > 0] <- 1
decademat <- t(decade2Ref.mat)

refSim <- rownames(decademat)
# This overwrites the city table of the same name; the result is written to
# its own file below.
cosSimCt <- data.frame()
# Running row index of the pair table, reset for this loop.
k <- 0
ilist <- c()
for (i in refSim) {
  # Skip partners already used as i: each unordered pair appears once.
  ilist <- c(ilist, i)
  for (j in refSim) {
    if (j %!in% ilist) {
      k <- k + 1
      cosSimCt[k, 1] <- i
      cosSimCt[k, 2] <- j
      vi <- decademat[i, ]
      vj <- decademat[j, ]
      s <- (sumNum(vi * vj)) / (norm_vec(vi) * norm_vec(vj))
      # An undefined similarity (missing / NaN) is stored as 0.
      cosSimCt[k, 3] <- if (is.na(s)) 0 else s
    }
  }
}
colnames(cosSimCt) <- c('i', 'j', 'cosSimCt')
write.csv(cosSimCt[order(-cosSimCt$cosSimCt), ], "simNets/SimilarDecades.csv")
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.0000 0.3780 0.3639 0.5774 1.0000
# Network of paper pairs with decade cosine similarity >= 0.65.
cs.decade <- cosSimCt[cosSimCt$cosSimCt >= 0.65, ]
g.decade <- graph_from_data_frame(cs.decade, directed = FALSE)

# norm_vec() of each vertex's decade profile, used as vertex size. Several
# tokens of the next statements were lost in this copy and are restored here.
decadeN <- as.data.frame(
  apply(decademat[rownames(decademat) %in% V(g.decade)$name, ], 1, FUN = norm_vec)
)
colnames(decadeN) <- "n_decades"
decadeN$ref <- rownames(decadeN)

# Align with the vertex order; data.frame() names the column "V.g.decade..name".
orderedName <- data.frame(V(g.decade)$name)
ordereddecadeN <- data.frame(
  orderedName,
  decadeN[match(orderedName$V.g.decade..name, decadeN$ref), ]
)[, "n_decades"]
ordereddecadeN <- ordereddecadeN[!is.na(ordereddecadeN)]

clln.decade <- cluster_louvain(g.decade)
layout <- layout_nicely(g.decade, 2)
g.decade$layout <- layout
par(mar = c(0, 0, 0, 0))
plot(g.decade, edge.width = sqrt(cs.decade$cosSimCt) * 2, vertex.size = ordereddecadeN * 2,
     vertex.label.cex = 0.7, edge.curved = .2, vertex.color = membership(clln.decade))
# Per-paper summary of the reported alpha values and pairwise differences.
# The is.na() tests in this chunk were lost in this copy and are restored here.
alphaData <- data[!is.na(data$ALPHALOTKA) & data$REFID %in% paperList, ]
alphaData$n <- 1

# Mean, standard deviation and number of estimates per paper.
meanAlphaPerRef <- aggregate(alphaData[, "ALPHALOTKA"], by = list(alphaData$REFID), FUN = meanNum)
sdAlphaPerRef <- aggregate(alphaData[, "ALPHALOTKA"], by = list(alphaData$REFID), FUN = sdNum)
# A missing standard deviation is set to 0.
sdAlphaPerRef[is.na(sdAlphaPerRef$x), "x"] <- 0
nAlphaPerRef <- aggregate(alphaData[, "n"], by = list(alphaData$REFID), FUN = sumNum)

# The three aggregates share the same grouping, so they are bound side by side;
# the repeated key columns (1, 3, 5) are dropped after use as row names.
similarAlpha <- cbind(meanAlphaPerRef, sdAlphaPerRef, nAlphaPerRef)
rownames(similarAlpha) <- similarAlpha$Group.1
similarAlpha[, c(1, 3, 5)] <- NULL
colnames(similarAlpha) <- c("meanAlpha", "sdAlpha", "nAlpha")
alphamat <- as.matrix(similarAlpha)

# Reference values used to normalise the pairwise differences.
meanData <- mean(alphaData[, "ALPHALOTKA"])
sdData <- sd(alphaData[, "ALPHALOTKA"])
meanN <- mean(similarAlpha$nAlpha)

refSim <- rownames(alphamat)
diff_mean <- data.frame()
diff_sd <- data.frame()
diff_n <- data.frame()
# Running row index shared by the three pair tables, reset for this loop.
k <- 0
ilist <- c()
for (i in refSim) {
  # Skip partners already used as i: each unordered pair appears once.
  ilist <- c(ilist, i)
  for (j in refSim) {
    if (j %!in% ilist) {
      k <- k + 1

      # Absolute difference of mean alpha, relative to the overall mean.
      diff_mean[k, 1] <- i
      diff_mean[k, 2] <- j
      mi <- similarAlpha[i, 1]
      mj <- similarAlpha[j, 1]
      diff_mean[k, 3] <- abs((mi - mj) / meanData)

      # Absolute difference of sd of alpha, relative to the overall sd.
      diff_sd[k, 1] <- i
      diff_sd[k, 2] <- j
      sdi <- similarAlpha[i, 2]
      sdj <- similarAlpha[j, 2]
      diff_sd[k, 3] <- abs((sdi - sdj) / sdData)

      # Absolute difference of the number of estimates, relative to its mean.
      diff_n[k, 1] <- i
      diff_n[k, 2] <- j
      ni <- similarAlpha[i, 3]
      nj <- similarAlpha[j, 3]
      diff_n[k, 3] <- abs((ni - nj) / meanN)
    }
  }
}
colnames(diff_mean) <- c('i', 'j', 'diff_mean')
write.csv(diff_mean[order(diff_mean$diff_mean), ], "simNets/diff_mean.csv")
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0003241 0.0773200 0.1593000 0.2142000 0.2773000 1.2080000
# Name the columns of the sd-difference table and save it, smallest first.
colnames(diff_sd) <- c('i', 'j', 'diff_sd')
write.csv(
  diff_sd[order(diff_sd$diff_sd), ],
  "simNets/diff_sd.csv"
)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.1345 0.3152 0.4388 0.5466 3.1350
# Name the columns of the n-difference table and save it, smallest first.
colnames(diff_n) <- c('i', 'j', 'diff_n')
write.csv(
  diff_n[order(diff_n$diff_n), ],
  "simNets/diff_n.csv"
)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.2305 0.6454 1.2150 1.3830 9.0820
# Network of paper pairs whose mean alpha differs by less than 0.025 (relative).
cs.alpha <- diff_mean[diff_mean$diff_mean < 0.025, ]
g.alpha <- graph_from_data_frame(cs.alpha, directed = FALSE)

# norm_vec() of each vertex's row of alphamat. Several tokens of the next
# statements were lost in this copy and are restored here.
alphaM <- as.data.frame(
  apply(alphamat[rownames(alphamat) %in% V(g.alpha)$name, ], 1, FUN = norm_vec)
)
colnames(alphaM) <- "mean_alphas"
alphaM$ref <- rownames(alphaM)

# Align with the vertex order; data.frame() names the column "V.g.alpha..name".
orderedName <- data.frame(V(g.alpha)$name)
orderedalphaM <- data.frame(
  orderedName,
  alphaM[match(orderedName$V.g.alpha..name, alphaM$ref), ]
)[, "mean_alphas"]
orderedalphaM <- orderedalphaM[!is.na(orderedalphaM)]

similarAlpha$REFID <- rownames(similarAlpha)
clln.alpha <- cluster_louvain(g.alpha)
layout <- layout_nicely(g.alpha, 2)
g.alpha$layout <- layout

# Per-vertex table: alpha summary plus the Louvain community of each paper.
memb <- as.list(membership(clln.alpha))
vertexData <- data.frame(orderedREFID = names(membership(clln.alpha)))
vertexData <- data.frame(vertexData, similarAlpha[match(vertexData$orderedREFID, similarAlpha$REFID), ])
for (r in vertexData$REFID) {
  vertexData[vertexData$REFID == r, "memb_mean"] <- memb[r]
}

par(mar = c(0, 0, 0, 0))
plot(g.alpha, edge.width = sqrt(cs.alpha$diff_mean) * 2, vertex.size = vertexData$meanAlpha^2 * 5,
     vertex.label.cex = 0.7, edge.curved = .2, vertex.color = membership(clln.alpha))
# average of the mean alpha reported, by community
aggregate(vertexData[, "meanAlpha"], by = list(vertexData$memb_mean), FUN = meanNum)
# Network of paper pairs whose sd of alpha differs by less than 0.1 (relative).
cs.alpha <- diff_sd[diff_sd$diff_sd < 0.1, ]
g.alpha <- graph_from_data_frame(cs.alpha, directed = FALSE)

# norm_vec() of each vertex's row of alphamat. Several tokens of the next
# statements were lost in this copy and are restored here.
alphaSD <- as.data.frame(
  apply(alphamat[rownames(alphamat) %in% V(g.alpha)$name, ], 1, FUN = norm_vec)
)
colnames(alphaSD) <- "mean_sds"
alphaSD$ref <- rownames(alphaSD)

# Align with the vertex order; data.frame() names the column "V.g.alpha..name".
orderedName <- data.frame(V(g.alpha)$name)
orderedalphaSD <- data.frame(
  orderedName,
  alphaSD[match(orderedName$V.g.alpha..name, alphaSD$ref), ]
)[, "mean_sds"]
orderedalphaSD <- orderedalphaSD[!is.na(orderedalphaSD)]

clln.alpha <- cluster_louvain(g.alpha)
layout <- layout_nicely(g.alpha, 2)
g.alpha$layout <- layout

# Per-vertex table: alpha summary plus the Louvain community of each paper.
memb <- as.list(membership(clln.alpha))
vertexData <- data.frame(orderedREFID = names(membership(clln.alpha)))
vertexData <- data.frame(vertexData, similarAlpha[match(vertexData$orderedREFID, similarAlpha$REFID), ])
for (r in vertexData$REFID) {
  vertexData[vertexData$REFID == r, "memb_sd"] <- memb[r]
}

par(mar = c(0, 0, 0, 0))
plot(g.alpha, edge.width = sqrt(cs.alpha$diff_sd) * 2, vertex.size = vertexData$sdAlpha * 50,
     vertex.label.cex = 0.7, edge.curved = .2, vertex.color = membership(clln.alpha))
# average of the sd of alpha reported, by community
aggregate(vertexData[, "sdAlpha"], by = list(vertexData$memb_sd), FUN = meanNum)
# Network of paper pairs whose number of estimates differs by less than 0.1
# (relative to the mean number of estimates).
cs.alpha <- diff_n[diff_n$diff_n < 0.1, ]
g.alpha <- graph_from_data_frame(cs.alpha, directed = FALSE)

# norm_vec() of each vertex's row of alphamat. Several tokens of the next
# statements were lost in this copy and are restored here.
alphaN <- as.data.frame(
  apply(alphamat[rownames(alphamat) %in% V(g.alpha)$name, ], 1, FUN = norm_vec)
)
colnames(alphaN) <- "mean_ns"
alphaN$ref <- rownames(alphaN)

# Align with the vertex order; data.frame() names the column "V.g.alpha..name".
orderedName <- data.frame(V(g.alpha)$name)
orderedalphaN <- data.frame(
  orderedName,
  alphaN[match(orderedName$V.g.alpha..name, alphaN$ref), ]
)[, "mean_ns"]
orderedalphaN <- orderedalphaN[!is.na(orderedalphaN)]

clln.alpha <- cluster_louvain(g.alpha)
layout <- layout_nicely(g.alpha, 2)
g.alpha$layout <- layout

# Per-vertex table: alpha summary plus the Louvain community of each paper.
memb <- as.list(membership(clln.alpha))
vertexData <- data.frame(orderedREFID = names(membership(clln.alpha)))
vertexData <- data.frame(vertexData, similarAlpha[match(vertexData$orderedREFID, similarAlpha$REFID), ])
for (r in vertexData$REFID) {
  vertexData[vertexData$REFID == r, "memb_n"] <- memb[r]
}

par(mar = c(0, 0, 0, 0))
plot(g.alpha, edge.width = sqrt(cs.alpha$diff_n) * 2, vertex.size = vertexData$nAlpha * 0.4,
     vertex.label.cex = 0.7, edge.curved = .2, vertex.color = membership(clln.alpha))
# average number of alpha estimates, by community
aggregate(vertexData[, "nAlpha"], by = list(vertexData$memb_n), FUN = meanNum)
# Mean-alpha differences: these dyads define the reference dyad IDs ("i_j")
# against which every other similarity table is matched.
meanAlphaDyads <- read.csv2(
  "simNets/diff_mean.csv",
  sep = ",", stringsAsFactors = FALSE
)[, -1]
meanAlphaDyads$dyadID <- with(meanAlphaDyads, paste(i, j, sep = "_"))
# A difference is turned into a similarity by changing its sign.
meanAlphaDyads$meanAlphaSim <- -as.numeric(meanAlphaDyads$diff_mean)
meanAlphaDyads <- meanAlphaDyads[, -(1:3)]

DYADS <- meanAlphaDyads
DYAD_ID_order <- DYADS$dyadID
# Differences in the sd of alpha, matched to the reference dyads.
sdAlphaDyads <- read.csv2("simNets/diff_sd.csv", sep = ",", stringsAsFactors = FALSE)[, -1]
# Dyad key in both orders, built from the first 8 characters of each id.
sdAlphaDyads$dyadIJ <- paste(substr(sdAlphaDyads$i, 1, 8),
                             substr(sdAlphaDyads$j, 1, 8), sep = "_")
# Reversed order (j first): previously this repeated the i_j key, so a pair
# stored in the opposite order could never be matched.
sdAlphaDyads$dyadJI <- paste(substr(sdAlphaDyads$j, 1, 8),
                             substr(sdAlphaDyads$i, 1, 8), sep = "_")
# Use whichever orientation exists among the reference dyads.
sdAlphaDyads$dyadID <- ifelse(sdAlphaDyads$dyadIJ %in% DYAD_ID_order, sdAlphaDyads$dyadIJ, sdAlphaDyads$dyadJI)
# A difference is turned into a similarity by changing its sign.
sdAlphaDyads$sdAlphaSim <- -as.numeric(sdAlphaDyads$diff_sd)
sdAlphaDyads[, 1:5] <- NULL
DYADS <- data.frame(DYADS, sdAlphaDyads[match(DYADS$dyadID, sdAlphaDyads$dyadID), "sdAlphaSim"])
# Differences in the number of alpha estimates, matched to the reference dyads.
nAlphaDyads <- read.csv2("simNets/diff_n.csv", sep = ",", stringsAsFactors = FALSE)[, -1]
# Dyad key in both orders, built from the first 8 characters of each id.
nAlphaDyads$dyadIJ <- paste(substr(nAlphaDyads$i, 1, 8),
                            substr(nAlphaDyads$j, 1, 8), sep = "_")
# Reversed order (j first): previously this repeated the i_j key, so a pair
# stored in the opposite order could never be matched.
nAlphaDyads$dyadJI <- paste(substr(nAlphaDyads$j, 1, 8),
                            substr(nAlphaDyads$i, 1, 8), sep = "_")
# Use whichever orientation exists among the reference dyads.
nAlphaDyads$dyadID <- ifelse(nAlphaDyads$dyadIJ %in% DYAD_ID_order, nAlphaDyads$dyadIJ, nAlphaDyads$dyadJI)
# A difference is turned into a similarity by changing its sign.
nAlphaDyads$nAlphaSim <- -as.numeric(nAlphaDyads$diff_n)
nAlphaDyads[, 1:5] <- NULL
DYADS <- data.frame(DYADS, nAlphaDyads[match(DYADS$dyadID, nAlphaDyads$dyadID), "nAlphaSim"])
# Wording similarity (explanatory variable), matched to the reference dyads.
wordingDyads <- read.csv2("simNets/SimilarWording.csv", sep = ",", stringsAsFactors = FALSE)[, -1]
# The ids in this file are document names; only their first 8 characters are
# used to build the dyad key.
wordingDyads$dyadIJ <- paste(substr(wordingDyads$i, 1, 8),
                             substr(wordingDyads$j, 1, 8), sep = "_")
# Reversed order (j first): previously this repeated the i_j key, so a pair
# stored in the opposite order to the reference dyads was silently left
# unmatched and later removed as an incomplete case.
wordingDyads$dyadJI <- paste(substr(wordingDyads$j, 1, 8),
                             substr(wordingDyads$i, 1, 8), sep = "_")
# Use whichever orientation exists among the reference dyads.
wordingDyads$dyadID <- ifelse(wordingDyads$dyadIJ %in% DYAD_ID_order, wordingDyads$dyadIJ, wordingDyads$dyadJI)
wordingDyads$wordingCosSim <- as.numeric(wordingDyads$cosSimTerm)
wordingDyads[, 1:5] <- NULL
DYADS <- data.frame(DYADS, wordingDyads[match(DYADS$dyadID, wordingDyads$dyadID), "wordingCosSim"])
# Citation similarity (explanatory variable): build the dyad key in both
# orientations and keep the one present among the reference dyads.
citationDyads <- read.csv2(
  "simNets/SimilarCitations.csv",
  sep = ",", stringsAsFactors = FALSE
)[, -1]
citationDyads$dyadIJ <- with(citationDyads, paste(i, j, sep = "_"))
citationDyads$dyadJI <- with(citationDyads, paste(j, i, sep = "_"))
citationDyads$dyadID <- with(
  citationDyads,
  ifelse(dyadIJ %in% DYAD_ID_order, dyadIJ, dyadJI)
)
citationDyads$citationCosSim <- as.numeric(citationDyads$cosSim)
citationDyads[, 1:5] <- NULL
DYADS <- data.frame(
  DYADS,
  citationDyads[match(DYADS$dyadID, citationDyads$dyadID), "citationCosSim"]
)

# Similarity of cited disciplines (explanatory variable), same procedure.
disciplineDyads <- read.csv2(
  "simNets/SimilarDisciplinesCited_rel.csv",
  sep = ",", stringsAsFactors = FALSE
)[, -1]
disciplineDyads$dyadIJ <- with(disciplineDyads, paste(i, j, sep = "_"))
disciplineDyads$dyadJI <- with(disciplineDyads, paste(j, i, sep = "_"))
disciplineDyads$dyadID <- with(
  disciplineDyads,
  ifelse(dyadIJ %in% DYAD_ID_order, dyadIJ, dyadJI)
)
disciplineDyads$disciplineCosSim <- as.numeric(disciplineDyads$cosSim)
disciplineDyads[, 1:5] <- NULL
DYADS <- data.frame(
  DYADS,
  disciplineDyads[match(DYADS$dyadID, disciplineDyads$dyadID), "disciplineCosSim"]
)
# Control variables. Each table gets a dyad key in both orientations; the one
# present among the reference dyads is kept and the similarity is appended.

# Countries studied.
countryDyads <- read.csv2(
  "simNets/SimilarCountries.csv",
  sep = ",", stringsAsFactors = FALSE
)[, -1]
countryDyads$dyadIJ <- with(countryDyads, paste(i, j, sep = "_"))
countryDyads$dyadJI <- with(countryDyads, paste(j, i, sep = "_"))
countryDyads$dyadID <- with(
  countryDyads,
  ifelse(dyadIJ %in% DYAD_ID_order, dyadIJ, dyadJI)
)
countryDyads$countryCosSim <- as.numeric(countryDyads$cosSimC)
countryDyads[, 1:5] <- NULL
DYADS <- data.frame(
  DYADS,
  countryDyads[match(DYADS$dyadID, countryDyads$dyadID), "countryCosSim"]
)

# Decades studied.
decadeDyads <- read.csv2(
  "simNets/SimilarDecades.csv",
  sep = ",", stringsAsFactors = FALSE
)[, -1]
decadeDyads$dyadIJ <- with(decadeDyads, paste(i, j, sep = "_"))
decadeDyads$dyadJI <- with(decadeDyads, paste(j, i, sep = "_"))
decadeDyads$dyadID <- with(
  decadeDyads,
  ifelse(dyadIJ %in% DYAD_ID_order, dyadIJ, dyadJI)
)
decadeDyads$decadeCosSim <- as.numeric(decadeDyads$cosSimCt)
decadeDyads[, 1:5] <- NULL
DYADS <- data.frame(
  DYADS,
  decadeDyads[match(DYADS$dyadID, decadeDyads$dyadID), "decadeCosSim"]
)

# City definition used.
cityDefDyads <- read.csv2(
  "simNets/SimilarCities.csv",
  sep = ",", stringsAsFactors = FALSE
)[, -1]
cityDefDyads$dyadIJ <- with(cityDefDyads, paste(i, j, sep = "_"))
cityDefDyads$dyadJI <- with(cityDefDyads, paste(j, i, sep = "_"))
cityDefDyads$dyadID <- with(
  cityDefDyads,
  ifelse(dyadIJ %in% DYAD_ID_order, dyadIJ, dyadJI)
)
cityDefDyads$cityDefCosSim <- as.numeric(cityDefDyads$cosSimCt)
cityDefDyads[, 1:5] <- NULL
DYADS <- data.frame(
  DYADS,
  cityDefDyads[match(DYADS$dyadID, cityDefDyads$dyadID), "cityDefCosSim"]
)
# Final names of the dyad table: the key followed by the nine similarities.
colnames(DYADS) <- c(
  "dyadID",
  "meanAlphaSim", "sdAlphaSim", "nAlphaSim",
  "fullTextCosSim", "externalCitationCosSim", "disciplineCitationCosSim",
  "countriesCosSim", "decadesCosSim", "cityDefCosSim"
)

# scale all variables: one "<name>_scaled" column per similarity
for (sim_var in colnames(DYADS)[-1]) {
  DYADS[[paste0(sim_var, "_scaled")]] <- scale(DYADS[[sim_var]])
}

# Keep only dyads for which every variable is available.
DYADS <- DYADS[complete.cases(DYADS), ]

# Model table: key plus the scaled variables, under short names.
modeldata <- DYADS[, c(
  "dyadID",
  "meanAlphaSim_scaled", "sdAlphaSim_scaled", "nAlphaSim_scaled",
  "fullTextCosSim_scaled", "externalCitationCosSim_scaled",
  "disciplineCitationCosSim_scaled",
  "countriesCosSim_scaled", "decadesCosSim_scaled", "cityDefCosSim_scaled"
)]
colnames(modeldata) <- c(
  "ID", "alpha", "sd_alpha", "n_estim",
  "text", "citation", "discipline",
  "country", "decade", "city"
)
# Linear models of the similarity in mean alpha.
# One model per explanatory variable ...
ftm_a <- lm(data = modeldata,
            formula = alpha ~ text,
            na.action = na.omit)
ecm_a <- lm(data = modeldata,
            formula = alpha ~ citation,
            na.action = na.omit)
dcm_a <- lm(data = modeldata,
            formula = alpha ~ discipline,
            na.action = na.omit)
# ... one with the controls only ...
ctrm_a <- lm(data = modeldata,
             formula = alpha ~ n_estim +
               country * decade * city,
             na.action = na.omit)
# ... and the full model.
fullmodel_a <- lm(data = modeldata,
                  formula = alpha ~
                    text + citation + discipline + n_estim +
                    country * decade * city,
                  na.action = na.omit)

# All five models go into the comparison table; the control and full models
# were missing from the call although the reported table has five columns.
sgm <- stargazer(ftm_a, ecm_a, dcm_a, ctrm_a, fullmodel_a,
                 type = "text", out = "results/model_comp_results_meanAlpha.html", font.size = "small", column.sep.width = "1pt")
Dependent variable:
(1) (2) (3) (4) (5)
text 0.048** 0.050**
(0.022) (0.023)
citation 0.062*** 0.069***
(0.022) (0.024)
discipline 0.043* 0.015
(0.022) (0.024)
n_estim -0.053** -0.046**
(0.022) (0.022)
country 0.063*** 0.063***
(0.023) (0.023)
decade -0.100*** -0.123***
(0.022) (0.023)
city -0.001 -0.015
(0.022) (0.023)
country:decade -0.014 -0.013
(0.021) (0.021)
country:city 0.049** 0.053**
(0.022) (0.022)
decade:city 0.037* 0.038*
(0.022) (0.022)
country:decade:city -0.017 -0.014
(0.021) (0.021)
Constant 0.015 0.015 0.015 0.013 0.012
(0.022) (0.022) (0.022) (0.022) (0.022)
Observations 2,016 2,016 2,016 2,016 2,016
R2 0.002 0.004 0.002 0.021 0.030
Adjusted R2 0.002 0.003 0.001 0.017 0.025
Residual Std. Error 1.003 (df = 2014) 1.003 (df = 2014) 1.004 (df = 2014) 0.996 (df = 2007) 0.992 (df = 2004)
F Statistic 4.519** (df = 1; 2014) 7.799*** (df = 1; 2014) 3.682* (df = 1; 2014) 5.403*** (df = 8; 2007) 5.707*** (df = 11; 2004)
Note: *p<0.1; **p<0.05; ***p<0.01
# Linear models of the similarity in the sd of alpha.
# One model per explanatory variable ...
ftm_sd <- lm(data = modeldata,
             formula = sd_alpha ~ text,
             na.action = na.omit)
ecm_sd <- lm(data = modeldata,
             formula = sd_alpha ~ citation,
             na.action = na.omit)
dcm_sd <- lm(data = modeldata,
             formula = sd_alpha ~ discipline,
             na.action = na.omit)
n_sd <- lm(data = modeldata,
           formula = sd_alpha ~ n_estim,
           na.action = na.omit)
# ... one with the controls only ...
ctrm_sd <- lm(data = modeldata,
              formula = sd_alpha ~
                country * decade * city,
              na.action = na.omit)
# ... and the full model.
fullmodel_sd <- lm(data = modeldata,
                   formula = sd_alpha ~
                     text + citation + discipline + n_estim +
                     country * decade * city,
                   na.action = na.omit)

# All six models go into the comparison table; the control and full models
# were missing from the call although the reported table has six columns.
sgm <- stargazer(ftm_sd, ecm_sd, dcm_sd, n_sd, ctrm_sd, fullmodel_sd,
                 type = "text", out = "results/model_comp_results_sdAlpha.html", font.size = "small", column.sep.width = "1pt")
Dependent variable:
(1) (2) (3) (4) (5) (6)
text 0.112*** 0.116***
(0.022) (0.023)
citation -0.013 -0.006
(0.022) (0.024)
discipline -0.004 -0.009
(0.022) (0.024)
n_estim 0.134*** 0.133***
(0.022) (0.022)
country -0.096*** -0.078***
(0.023) (0.022)
decade -0.077*** -0.076***
(0.022) (0.023)
city 0.088*** 0.083***
(0.022) (0.022)
country:decade 0.041* 0.042**
(0.021) (0.021)
country:city 0.050** 0.051**
(0.022) (0.022)
decade:city 0.050** 0.051**
(0.022) (0.022)
country:decade:city -0.023 -0.016
(0.021) (0.021)
Constant -0.009 -0.009 -0.009 -0.008 -0.014 -0.013
(0.022) (0.022) (0.022) (0.022) (0.022) (0.022)
Observations 2,016 2,016 2,016 2,016 2,016 2,016
R2 0.012 0.0002 0.00002 0.018 0.027 0.055
Adjusted R2 0.012 -0.0003 -0.0005 0.017 0.023 0.050
Residual Std. Error 0.999 (df = 2014) 1.005 (df = 2014) 1.005 (df = 2014) 0.996 (df = 2014) 0.993 (df = 2008) 0.980 (df = 2004)
F Statistic 25.463*** (df = 1; 2014) 0.336 (df = 1; 2014) 0.032 (df = 1; 2014) 36.782*** (df = 1; 2014) 7.818*** (df = 7; 2008) 10.685*** (df = 11; 2004)
Note: *p<0.1; **p<0.05; ***p<0.01
# Residuals of the full mean-alpha model, with the two paper ids recovered
# from the dyad key (characters 1-8 and 10-17).
modeldata$res <- fullmodel_a$residuals
modeldata$i <- substr(modeldata$ID, 1, 8)
modeldata$j <- substr(modeldata$ID, 10, 17)

# Network of dyads with residual > 1.1.
cs.residuals_resPos <- modeldata[modeldata$res > 1.1, c("i", "j", "res")]
g.residuals <- graph_from_data_frame(cs.residuals_resPos, directed = FALSE)

# norm_vec() of each vertex's row of alphamat. Several tokens of the next
# statements were lost in this copy and are restored here.
resPos <- as.data.frame(
  apply(alphamat[rownames(alphamat) %in% V(g.residuals)$name, ], 1, FUN = norm_vec)
)
colnames(resPos) <- "residuals"
resPos$ref <- rownames(resPos)

# Align with the vertex order; the column is named "V.g.residuals..name".
orderedName <- data.frame(V(g.residuals)$name)
orderedresidualsRes <- data.frame(
  orderedName,
  resPos[match(orderedName$V.g.residuals..name, resPos$ref), ]
)[, "residuals"]
orderedresidualsRes <- orderedresidualsRes[!is.na(orderedresidualsRes)]

layout <- layout_nicely(g.residuals, 2)
g.residuals$layout <- layout
par(mar = c(0, 0, 1, 0))
plot(g.residuals, edge.width = sqrt(cs.residuals_resPos$res) * 2, vertex.size = 2,
     vertex.label.cex = 0.7, edge.curved = .2, vertex.color = "green", edge.color = "coral3",
     main = "Most positive residuals in mean alpha similarity")
# Network of dyads with residual < -2 in the full mean-alpha model.
cs.residuals_resNeg <- modeldata[modeldata$res < -2, c("i", "j", "res")]
g.residuals <- graph_from_data_frame(cs.residuals_resNeg, directed = FALSE)

# norm_vec() of each vertex's row of alphamat. Several tokens of the next
# statements were lost in this copy and are restored here.
resNeg <- as.data.frame(
  apply(alphamat[rownames(alphamat) %in% V(g.residuals)$name, ], 1, FUN = norm_vec)
)
colnames(resNeg) <- "residuals"
resNeg$ref <- rownames(resNeg)

# Align with the vertex order; the column is named "V.g.residuals..name".
orderedName <- data.frame(V(g.residuals)$name)
orderedresidualsRes <- data.frame(
  orderedName,
  resNeg[match(orderedName$V.g.residuals..name, resNeg$ref), ]
)[, "residuals"]
orderedresidualsRes <- orderedresidualsRes[!is.na(orderedresidualsRes)]

layout <- layout_nicely(g.residuals, 2)
g.residuals$layout <- layout
par(mar = c(0, 0, 1, 0))
# The residuals kept here are all negative, so the edge width is based on their
# magnitude; sqrt() of the raw values would be NaN.
plot(g.residuals, edge.width = sqrt(abs(cs.residuals_resNeg$res)) * 2, vertex.size = 2,
     vertex.label.cex = 0.7, edge.curved = .2, vertex.color = "orange", edge.color = "dodgerblue3",
     main = "Most negative residuals in mean alpha similarity")
# Residuals of the full sd-alpha model, with the two paper ids recovered from
# the dyad key (characters 1-8 and 10-17).
modeldata$res <- fullmodel_sd$residuals
modeldata$i <- substr(modeldata$ID, 1, 8)
modeldata$j <- substr(modeldata$ID, 10, 17)

# Network of dyads with residual > 1.
cs.residuals_resPos <- modeldata[modeldata$res > 1, c("i", "j", "res")]
g.residuals <- graph_from_data_frame(cs.residuals_resPos, directed = FALSE)

# norm_vec() of each vertex's row of alphamat. Several tokens of the next
# statements were lost in this copy and are restored here.
resPos <- as.data.frame(
  apply(alphamat[rownames(alphamat) %in% V(g.residuals)$name, ], 1, FUN = norm_vec)
)
colnames(resPos) <- "residuals"
resPos$ref <- rownames(resPos)

# Align with the vertex order; the column is named "V.g.residuals..name".
orderedName <- data.frame(V(g.residuals)$name)
orderedresidualsRes <- data.frame(
  orderedName,
  resPos[match(orderedName$V.g.residuals..name, resPos$ref), ]
)[, "residuals"]
orderedresidualsRes <- orderedresidualsRes[!is.na(orderedresidualsRes)]
#clln.residuals <- cluster_louvain(g.residuals)

layout <- layout_nicely(g.residuals, 2)
g.residuals$layout <- layout
par(mar = c(0, 0, 1, 0))
plot(g.residuals, edge.width = sqrt(cs.residuals_resPos$res) * 2, vertex.size = 2,
     vertex.label.cex = 0.7, edge.curved = .2, vertex.color = "green", edge.color = "coral3",
     main = "Most positive residuals in sd alpha similarity")
# Network of dyads with residual < -1.5 in the full sd-alpha model.
cs.residuals_resNeg <- modeldata[modeldata$res < -1.5, c("i", "j", "res")]
g.residuals <- graph_from_data_frame(cs.residuals_resNeg, directed = FALSE)

# norm_vec() of each vertex's row of alphamat. Several tokens of the next
# statements were lost in this copy and are restored here.
resNeg <- as.data.frame(
  apply(alphamat[rownames(alphamat) %in% V(g.residuals)$name, ], 1, FUN = norm_vec)
)
colnames(resNeg) <- "residuals"
resNeg$ref <- rownames(resNeg)

# Align with the vertex order; the column is named "V.g.residuals..name".
orderedName <- data.frame(V(g.residuals)$name)
orderedresidualsRes <- data.frame(
  orderedName,
  resNeg[match(orderedName$V.g.residuals..name, resNeg$ref), ]
)[, "residuals"]
orderedresidualsRes <- orderedresidualsRes[!is.na(orderedresidualsRes)]

layout <- layout_nicely(g.residuals, 2)
g.residuals$layout <- layout
par(mar = c(0, 0, 1, 0))
# The residuals kept here are all negative, so the edge width is based on their
# magnitude; sqrt() of the raw values would be NaN.
plot(g.residuals, edge.width = sqrt(abs(cs.residuals_resNeg$res)) * 2, vertex.size = 2,
     vertex.label.cex = 0.7, edge.curved = .2, vertex.color = "orange", edge.color = "dodgerblue3",
     main = "Most negative residuals in sd alpha similarity")