Skip to content

Commit 19d3d79

Browse files
committed
code and figures.
1 parent 4e076fd commit 19d3d79

9 files changed

+536
-0
lines changed

output/plots/nytScree.pdf

4.91 KB
Binary file not shown.
179 KB
Loading
Loading

output/plots/nyt_vsp_centered_pcs.png

192 KB
Loading
194 KB
Loading

output/plots/rotInv.pdf

354 KB
Binary file not shown.

scripts/functions/vsp_fast.R

+418
Large diffs are not rendered by default.

scripts/makeFigure1.R

+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
library(magrittr)
2+
n = 10000
3+
normDat = matrix(rnorm(n*2), ncol = 2) %>% scale()
4+
expDat = matrix(sample(c(-1,1), 2*n, T)*rexp(n*2)^1.3, ncol = 2) %>% scale()
5+
s = matrix(c(1, -2,-3,1), ncol = 2) %>% svd()
6+
R = s$u
7+
rexpDat = (expDat %*% R)
8+
sc = 3
9+
10+
11+
12+
pdf(file = "rotInv.pdf", width = 10, height = 3.5)
13+
par(mfrow =c(1,3), mar = c(.1,1,4,1), pty="s")
14+
plot(normDat, pch = ".", xlim = sc*c(-1,1), ylim=sc*c(-1,1), axes=F, ylab = "", xlab="",
15+
main = "Gaussian is\nrotationally invariant.", cex.main = 2)
16+
arrows(x0 = c(-3,0),y0 = c(0,-3), x1 = c(3,0),y1 = c(0,3), code =3, col = "grey43", lwd = 2)
17+
# lines(c(-99,099), c(0,0), col = "grey43", lwd = 2)
18+
# lines(c(0,0), c(-99,099), col = "grey43", lwd = 2)
19+
points(normDat, pch = ".")
20+
points(0,0, col = "red", cex=2, pch =19)
21+
22+
23+
24+
plot(rexpDat, pch = ".", xlim = sc*c(-1,1), ylim=sc*c(-1,1), axes=F, ylab = "", xlab="",
25+
main = "This non-Gaussian\nhas radial streaks.", cex.main = 2)
26+
arrows(x0 = c(-3,0),y0 = c(0,-3), x1 = c(3,0),y1 = c(0,3), code =3, col = "grey43", lwd = 2)
27+
# lines(c(-99,099), c(0,0), col = "grey43", lwd = 2)
28+
# lines(c(0,0), c(-99,099), col = "grey43", lwd = 2)
29+
points(rexpDat, pch = ".")
30+
points(0,0, col = "red", cex=2, pch =19)
31+
32+
33+
plot(rexpDat, pch = ".", xlim = sc*c(-1,1), ylim=sc*c(-1,1), axes=F, ylab = "", xlab="",
34+
main = "Varimax correctly\nestimates the basis.", cex.main = 2)
35+
v = varimax(rexpDat)
36+
stdBasis = matrix(c(1,0,-1,0,0,1,0,-1), byrow=T, ncol = 2)
37+
rb = stdBasis%*%R*3.5
38+
arrows(x0 = rb[c(1,3),1],y0 = rb[c(1,3),2], x1 = rb[-c(1,3),1],y1 = rb[-c(1,3),2], code =3, col = "grey43", lwd = 2)
39+
# lines(x = rb[1:2,1], y = rb[1:2,2], col = "grey43", lwd = 2)
40+
# lines(x = rb[3:4,1], y = rb[3:4,2], col = "grey43", lwd = 2)
41+
points(rexpDat, pch = ".")
42+
points(0,0, col = "red", cex=2, pch =19)
43+
dev.off()

scripts/makeNYTfigures.R

+75
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
# https://archive.ics.uci.edu/ml/datasets/bag+of+words
2+
library(data.table)
3+
library(Matrix)
4+
library(igraph)
5+
rm(list=ls())
6+
source("scripts/functions/vsp_fast.R")
7+
8+
# download the data from https://archive.ics.uci.edu/ml/datasets/Bag+of+Words
9+
# uncomment this next line to download the 223 MB data file.
10+
el = read_delim("https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/docword.nytimes.txt.gz", skip = 3, delim = " ", col_names = F)
11+
el = as.tbl(el) %>% as.matrix
12+
A = spMatrix(nrow = max(el[,1]), ncol = max(el[,2]), i = el[,1], j = el[,2], x = el[,3]) %>% as("dgCMatrix")
13+
words = read_delim("https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/vocab.nytimes.txt", delim = "\n", col_names =F)$X1
14+
colnames(A)= words
15+
save(A, file = "data/nytA.RData")
16+
# load(file = "data/nytA.RData")
17+
18+
vsp_factors = vsp(A, k = 50, sym = F, centering = T)
19+
save(vsp_factors, file = "output/data/vsp_factors_50.RData")
20+
# load(file = "output/data/vsp_factors_50.RData")
21+
22+
23+
# Are any of the singular vectors localized on a few documents?
24+
U = vsp_factors$pcs$U
25+
l4 = colSums(U^4)^(1/4)
26+
plot(l4) #yes
27+
lines(c(0,10000), .15*c(1,1))
28+
29+
# localized singular vectors only highlight a small number of documents.
30+
# they are often an artifact of noise in sparse graphs.
31+
# https://papers.nips.cc/paper/8262-understanding-regularized-spectral-clustering-via-graph-conductance.pdf
32+
# the analysis below removes the singular vectors with 4th moment greater than .15.
33+
localized = l4>.15
34+
hist(U[,localized], breaks = 1000)
35+
hist(U[,!localized], breaks = 1000)
36+
37+
# remove the singular values that correspond to localized
38+
# singular vectors. then, make a screeplot
39+
pdf(file = "output/plots/nytScree.pdf", height = 4, width = 4)
40+
vsp_factors$pcs$scree[which(!localized)] %>%
41+
plot(main = "There is an eigengap \n at the 8th singular value",
42+
xlab = "Leading singular values")
43+
lines(8.5*c(1,1), c(0,10^9))
44+
dev.off()
45+
46+
# there is an eigengap at 8
47+
# so, these are the singular vectors we want to keep and rotate:
48+
which(!localized)[1:8]
49+
good = which(!localized)[1:8]
50+
# you can pass this to vsp_rotate, with the old vsp object,
51+
# setting k = good.
52+
53+
all_factors = vsp_factors
54+
vsp_factors = vsp_keep(all_factors, keepThese = good, fast = F, recenter=T)
55+
56+
57+
png(file = "output/plots/nyt_vsp_centered_pcs.png", height = 7, width = 7, units= "in", res= 200)
58+
plot(vsp_factors,k=8,whichPlot = "u",nsamp = 5000)
59+
dev.off()
60+
61+
62+
png(file = "output/plots/nyt_vsp_centered_factors.png", height = 7, width = 7, units= "in", res= 200)
63+
plot(vsp_factors,k=8,whichPlot = "z",nsamp = 5000)
64+
dev.off()
65+
66+
67+
png(file = "output/plots/nyt_vsp_centered_pcs_WORDS.png", height = 7, width = 7, units= "in", res= 200)
68+
plot(vsp_factors,k=8,whichPlot = "v",nsamp = 5000)
69+
dev.off()
70+
71+
72+
png(file = "output/plots/nyt_vsp_centered_factors_WORDS.png", height = 7, width = 7, units= "in", res= 200)
73+
plot(vsp_factors,k=8,whichPlot = "y",nsamp = 5000)
74+
dev.off()
75+

0 commit comments

Comments
 (0)