Skip to content

Commit a644dbe

Browse files
committed
Fix dataset import
1 parent 18fe95a commit a644dbe

File tree

3 files changed

+245
-0
lines changed

3 files changed

+245
-0
lines changed
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
# Export tables from the official SO dump to CSV so that they can be imported into BigQuery
2+
3+
# Users
4+
# Writes every row of `Users` to '<PATH>Users.csv'
# ('<PATH>' is presumably a placeholder for the export directory prefix — confirm before running).
# Id and Reputation are written as-is; every other column is wrapped in IFNULL(..., '')
# so that a NULL value is exported as an empty field.
# Output format: utf8mb4, comma-separated, fields optionally enclosed in double quotes,
# double quote as the escape character, rows terminated by '\n'.
# NOTE(review): AboutMe is exported without the REPLACE(..., '\n', '&#xD;&#xA;') that this
# script applies to Posts.Body and Comments.Text — confirm AboutMe cannot contain line breaks.
SELECT
5+
Id,
6+
Reputation,
7+
IFNULL(CreationDate, ''),
8+
IFNULL(DisplayName, ''),
9+
IFNULL(LastAccessDate, ''),
10+
IFNULL(WebsiteUrl, ''),
11+
IFNULL(Location, ''),
12+
IFNULL(ProfileImageUrl, ''),
13+
IFNULL(AboutMe, ''),
14+
IFNULL(Views, ''),
15+
IFNULL(UpVotes, ''),
16+
IFNULL(DownVotes, ''),
17+
IFNULL(Age, ''),
18+
IFNULL(AccountId, ''),
19+
IFNULL(EmailHash, '')
20+
INTO OUTFILE '<PATH>Users.csv'
21+
CHARACTER SET utf8mb4
22+
FIELDS TERMINATED BY ','
23+
OPTIONALLY ENCLOSED BY '\"'
24+
ESCAPED BY '\"'
25+
LINES TERMINATED BY '\n'
26+
FROM `Users`;
27+
28+
# Badges
29+
# Writes every row of `Badges` to '<PATH>Badges.csv'.
# Id and UserId are written as-is; Name, Date, Class and TagBased are wrapped in
# IFNULL(..., '') so that a NULL value is exported as an empty field.
# Output format: utf8mb4, comma-separated, fields optionally enclosed in double quotes,
# double quote as the escape character, rows terminated by '\n'.
SELECT
30+
Id,
31+
UserId,
32+
IFNULL(Name, ''),
33+
IFNULL(Date, ''),
34+
IFNULL(Class, ''),
35+
IFNULL(TagBased, '')
36+
INTO OUTFILE '<PATH>Badges.csv'
37+
CHARACTER SET utf8mb4
38+
FIELDS TERMINATED BY ','
39+
OPTIONALLY ENCLOSED BY '\"'
40+
ESCAPED BY '\"'
41+
LINES TERMINATED BY '\n'
42+
FROM `Badges`;
43+
44+
# Posts
45+
# Writes every row of `Posts` to '<PATH>Posts.csv'.
# Id is written as-is; every other column is wrapped in IFNULL(..., '') so that a NULL
# value is exported as an empty field.
# Body: each '\n' is replaced by the literal text '&#xD;&#xA;' before export, so the value
# contains no character equal to the row terminator. Only '\n' is matched; any '\r' already
# present in Body is left untouched.
# NOTE(review): Title, Tags and the display-name columns are exported without that
# replacement — confirm they cannot contain line breaks.
# Output format: utf8mb4, comma-separated, fields optionally enclosed in double quotes,
# double quote as the escape character, rows terminated by '\n'.
SELECT
46+
Id,
47+
IFNULL(PostTypeId, ''),
48+
IFNULL(AcceptedAnswerId, ''),
49+
IFNULL(ParentId, ''),
50+
IFNULL(CreationDate, ''),
51+
IFNULL(DeletionDate, ''),
52+
IFNULL(Score, ''),
53+
IFNULL(ViewCount, ''),
54+
IFNULL(REPLACE(Body, '\n', '&#xD;&#xA;'), ''),
55+
IFNULL(OwnerUserId, ''),
56+
IFNULL(OwnerDisplayName, ''),
57+
IFNULL(LastEditorUserId, ''),
58+
IFNULL(LastEditorDisplayName, ''),
59+
IFNULL(LastEditDate, ''),
60+
IFNULL(LastActivityDate, ''),
61+
IFNULL(Title, ''),
62+
IFNULL(Tags, ''),
63+
IFNULL(AnswerCount, ''),
64+
IFNULL(CommentCount, ''),
65+
IFNULL(FavoriteCount, ''),
66+
IFNULL(ClosedDate, ''),
67+
IFNULL(CommunityOwnedDate, '')
68+
INTO OUTFILE '<PATH>Posts.csv'
69+
CHARACTER SET utf8mb4
70+
FIELDS TERMINATED BY ','
71+
OPTIONALLY ENCLOSED BY '\"'
72+
ESCAPED BY '\"'
73+
LINES TERMINATED BY '\n'
74+
FROM `Posts`;
75+
76+
# Comments
77+
# Writes every row of `Comments` to '<PATH>Comments.csv'.
# Id, PostId and Score are written as-is; the remaining columns are wrapped in
# IFNULL(..., '') so that a NULL value is exported as an empty field.
# Text: each '\n' is replaced by the literal text '&#xD;&#xA;' before export, so the value
# contains no character equal to the row terminator ('\r' is not touched).
# Output format: utf8mb4, comma-separated, fields optionally enclosed in double quotes,
# double quote as the escape character, rows terminated by '\n'.
SELECT
78+
Id,
79+
PostId,
80+
Score,
81+
IFNULL(REPLACE(Text, '\n', '&#xD;&#xA;'), ''),
82+
IFNULL(CreationDate, ''),
83+
IFNULL(UserDisplayName, ''),
84+
IFNULL(UserId, '')
85+
INTO OUTFILE '<PATH>Comments.csv'
86+
CHARACTER SET utf8mb4
87+
FIELDS TERMINATED BY ','
88+
OPTIONALLY ENCLOSED BY '\"'
89+
ESCAPED BY '\"'
90+
LINES TERMINATED BY '\n'
91+
FROM `Comments`;
92+
93+
# PostHistory
94+
# Writes every row of `PostHistory` to '<PATH>PostHistory.csv'.
# Id, PostHistoryTypeId and PostId are written as-is; the remaining columns are wrapped in
# IFNULL(..., '') so that a NULL value is exported as an empty field.
# Comment and Text: each '\n' is replaced by the literal text '&#xD;&#xA;' before export,
# so the values contain no character equal to the row terminator ('\r' is not touched).
# Output format: utf8mb4, comma-separated, fields optionally enclosed in double quotes,
# double quote as the escape character, rows terminated by '\n'.
SELECT
95+
Id,
96+
PostHistoryTypeId,
97+
PostId,
98+
IFNULL(RevisionGUID, ''),
99+
IFNULL(CreationDate, ''),
100+
IFNULL(UserId, ''),
101+
IFNULL(UserDisplayName, ''),
102+
IFNULL(REPLACE(Comment, '\n', '&#xD;&#xA;'), ''),
103+
IFNULL(REPLACE(Text, '\n', '&#xD;&#xA;'), '')
104+
INTO OUTFILE '<PATH>PostHistory.csv'
105+
CHARACTER SET utf8mb4
106+
FIELDS TERMINATED BY ','
107+
OPTIONALLY ENCLOSED BY '\"'
108+
ESCAPED BY '\"'
109+
LINES TERMINATED BY '\n'
110+
FROM `PostHistory`;
111+
112+
# PostLinks
113+
# Writes every row of `PostLinks` to '<PATH>PostLinks.csv'.
# Id, PostId and RelatedPostId are written as-is; CreationDate and LinkTypeId are wrapped in
# IFNULL(..., '') so that a NULL value is exported as an empty field.
# Note the column order: CreationDate is the second exported field, before PostId.
# Output format: utf8mb4, comma-separated, fields optionally enclosed in double quotes,
# double quote as the escape character, rows terminated by '\n'.
SELECT
114+
Id,
115+
IFNULL(CreationDate, ''),
116+
PostId,
117+
RelatedPostId,
118+
IFNULL(LinkTypeId, '')
119+
INTO OUTFILE '<PATH>PostLinks.csv'
120+
CHARACTER SET utf8mb4
121+
FIELDS TERMINATED BY ','
122+
OPTIONALLY ENCLOSED BY '\"'
123+
ESCAPED BY '\"'
124+
LINES TERMINATED BY '\n'
125+
FROM `PostLinks`;
126+
127+
# Tags
128+
# Writes every row of `Tags` to '<PATH>Tags.csv'.
# Id is written as-is; TagName, Count, ExcerptPostId and WikiPostId are wrapped in
# IFNULL(..., '') so that a NULL value is exported as an empty field.
# Output format: utf8mb4, comma-separated, fields optionally enclosed in double quotes,
# double quote as the escape character, rows terminated by '\n'.
SELECT
129+
Id,
130+
IFNULL(TagName, ''),
131+
IFNULL(Count, ''),
132+
IFNULL(ExcerptPostId, ''),
133+
IFNULL(WikiPostId, '')
134+
INTO OUTFILE '<PATH>Tags.csv'
135+
CHARACTER SET utf8mb4
136+
FIELDS TERMINATED BY ','
137+
OPTIONALLY ENCLOSED BY '\"'
138+
ESCAPED BY '\"'
139+
LINES TERMINATED BY '\n'
140+
FROM `Tags`;
141+
142+
# Votes
143+
# Writes every row of `Votes` to '<PATH>Votes.csv'.
# Id and PostId are written as-is; VoteTypeId, UserId, CreationDate and BountyAmount are
# wrapped in IFNULL(..., '') so that a NULL value is exported as an empty field.
# Output format: utf8mb4, comma-separated, fields optionally enclosed in double quotes,
# double quote as the escape character, rows terminated by '\n'.
SELECT
144+
Id,
145+
PostId,
146+
IFNULL(VoteTypeId, ''),
147+
IFNULL(UserId, ''),
148+
IFNULL(CreationDate, ''),
149+
IFNULL(BountyAmount, '')
150+
INTO OUTFILE '<PATH>Votes.csv'
151+
CHARACTER SET utf8mb4
152+
FIELDS TERMINATED BY ','
153+
OPTIONALLY ENCLOSED BY '\"'
154+
ESCAPED BY '\"'
155+
LINES TERMINATED BY '\n'
156+
FROM `Votes`;
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
# Add secure-file-priv="<output_path>" under [mysqld] in my.ini or /etc/mysql/mysql.conf.d/mysqld.cnf
2+
# to allow file export to that directory. On Windows, write the path with forward slashes instead of backslashes, e.g., F:/Temp
3+
# Alternatively, disable secure-file-priv by setting it to ""
4+
# If AppArmor is activated for MySQL, the MySQL profile has to be modified to allow accessing /data/tmp/:
5+
# sudo nano /etc/apparmor.d/local/usr.sbin.mysqld
6+
# # Site-specific additions and overrides for usr.sbin.mysqld.
7+
# # For more details, please see /etc/apparmor.d/local/README.
8+
# /data/tmp/ r,
9+
# /data/tmp/** rwk,
10+
# sudo service apparmor reload
11+
# Alternative: Temporarily disable AppArmor for MySQL
12+
# (see, e.g., https://www.cyberciti.biz/faq/ubuntu-linux-howto-disable-apparmor-commands/)
13+
14+
# Writes every row of `PostBlockDiff` to '<PATH>PostBlockDiff.csv'.
# Text: each '\n' is replaced by the literal text '&#xD;&#xA;' before export, so the value
# contains no character equal to the row terminator ('\r' is not touched).
# NOTE(review): no column here is wrapped in IFNULL(..., ''), and REPLACE returns NULL for a
# NULL input — presumably all of these columns are NOT NULL; confirm against the table definition.
# Output format: utf8mb4, comma-separated, fields optionally enclosed in double quotes,
# double quote as the escape character, rows terminated by '\n'.
SELECT Id, PostId, PostHistoryId, LocalId, PostBlockVersionId, PredPostHistoryId, PredLocalId, PredPostBlockVersionId, PostBlockDiffOperationId, REPLACE(Text, '\n', '&#xD;&#xA;')
15+
INTO OUTFILE '<PATH>PostBlockDiff.csv'
16+
CHARACTER SET utf8mb4
17+
FIELDS TERMINATED BY ','
18+
OPTIONALLY ENCLOSED BY '\"'
19+
ESCAPED BY '\"'
20+
LINES TERMINATED BY '\n'
21+
FROM `PostBlockDiff`;
22+
23+
# Writes every row of `PostVersion` to '<PATH>PostVersion.csv'.
# PredPostHistoryId, SuccPostHistoryId and Comment are wrapped in IFNULL(..., '') so that a
# NULL value is exported as an empty field; the other columns are written as-is.
# Comment is the last exported field; each '\n' in it is replaced by the literal text
# '&#xD;&#xA;' so the value contains no character equal to the row terminator.
# Output format: utf8mb4, comma-separated, fields optionally enclosed in double quotes,
# double quote as the escape character, rows terminated by '\n'.
SELECT Id, PostId, PostTypeId, PostHistoryId, PostHistoryTypeId, CreationDate, IFNULL(PredPostHistoryId, ''), IFNULL(SuccPostHistoryId, ''), MostRecentVersion, IFNULL(REPLACE(Comment, '\n', '&#xD;&#xA;'), '')
24+
INTO OUTFILE '<PATH>PostVersion.csv'
25+
CHARACTER SET utf8mb4
26+
FIELDS TERMINATED BY ','
27+
OPTIONALLY ENCLOSED BY '\"'
28+
ESCAPED BY '\"'
29+
LINES TERMINATED BY '\n'
30+
FROM `PostVersion`;
31+
32+
# Writes every row of `PostBlockVersion` to '<PATH>PostBlockVersion.csv'.
# The Pred*/Root*/SuccCount columns are wrapped in IFNULL(..., '') so that a NULL value is
# exported as an empty field; the other columns are written as-is.
# Content: each '\n' is replaced by the literal text '&#xD;&#xA;' before export, so the value
# contains no character equal to the row terminator ('\r' is not touched).
# NOTE(review): Content has no IFNULL wrapper and REPLACE returns NULL for a NULL input —
# presumably Content is NOT NULL; confirm against the table definition.
# Output format: utf8mb4, comma-separated, fields optionally enclosed in double quotes,
# double quote as the escape character, rows terminated by '\n'.
SELECT Id, PostBlockTypeId, PostId, PostHistoryId, LocalId, IFNULL(PredPostBlockVersionId, ''), IFNULL(PredPostHistoryId, ''), IFNULL(PredLocalId, ''), IFNULL(RootPostBlockVersionId, ''), IFNULL(RootPostHistoryId, ''), IFNULL(RootLocalId, ''), IFNULL(PredEqual, ''), IFNULL(PredSimilarity, ''), IFNULL(PredCount, ''), IFNULL(SuccCount, ''), Length, LineCount, REPLACE(Content, '\n', '&#xD;&#xA;'), MostRecentVersion
33+
INTO OUTFILE '<PATH>PostBlockVersion.csv'
34+
CHARACTER SET utf8mb4
35+
FIELDS TERMINATED BY ','
36+
OPTIONALLY ENCLOSED BY '\"'
37+
ESCAPED BY '\"'
38+
LINES TERMINATED BY '\n'
39+
FROM `PostBlockVersion`;
40+
41+
# Writes every row of `PostVersionUrl` to '<PATH>PostVersionUrl.csv'.
# Path, Query and FragmentIdentifier are wrapped in IFNULL(..., '') so that a NULL value is
# exported as an empty field. Newlines in FullMatch are replaced by the literal text '&#xD;&#xA;'.
# NOTE(review): FullMatch is selected twice — as the 7th field (with IFNULL) and again as the
# last field (without IFNULL). The 7th field looks like a copy/paste slip for a different,
# nullable column (presumably LinkAnchor, if the table has one) — TODO confirm against the
# table definition and the target import schema before changing the select list.
# Output format: utf8mb4, comma-separated, fields optionally enclosed in double quotes,
# double quote as the escape character, rows terminated by '\n'.
SELECT Id, PostId, PostHistoryId, PostBlockVersionId, LinkType, LinkPosition, REPLACE(IFNULL(FullMatch, ''), '\n', '&#xD;&#xA;'), Protocol, RootDomain, CompleteDomain, IFNULL(Path, ''), IFNULL(Query, ''), IFNULL(FragmentIdentifier, ''), Url, REPLACE(FullMatch, '\n', '&#xD;&#xA;')
42+
INTO OUTFILE '<PATH>PostVersionUrl.csv'
43+
CHARACTER SET utf8mb4
44+
FIELDS TERMINATED BY ','
45+
OPTIONALLY ENCLOSED BY '\"'
46+
ESCAPED BY '\"'
47+
LINES TERMINATED BY '\n'
48+
FROM `PostVersionUrl`;
49+
50+
# Writes every row of `CommentUrl` to '<PATH>CommentUrl.csv'.
# Path, Query and FragmentIdentifier are wrapped in IFNULL(..., '') so that a NULL value is
# exported as an empty field. Newlines in FullMatch are replaced by the literal text '&#xD;&#xA;'.
# NOTE(review): FullMatch is selected twice — as the 6th field (with IFNULL) and again as the
# last field (without IFNULL). The 6th field looks like a copy/paste slip for a different,
# nullable column (presumably LinkAnchor, if the table has one) — TODO confirm against the
# table definition and the target import schema before changing the select list.
# Output format: utf8mb4, comma-separated, fields optionally enclosed in double quotes,
# double quote as the escape character, rows terminated by '\n'.
SELECT Id, PostId, CommentId, LinkType, LinkPosition, REPLACE(IFNULL(FullMatch, ''), '\n', '&#xD;&#xA;'), Protocol, RootDomain, CompleteDomain, IFNULL(Path, ''), IFNULL(Query, ''), IFNULL(FragmentIdentifier, ''), Url, REPLACE(FullMatch, '\n', '&#xD;&#xA;')
51+
INTO OUTFILE '<PATH>CommentUrl.csv'
52+
CHARACTER SET utf8mb4
53+
FIELDS TERMINATED BY ','
54+
OPTIONALLY ENCLOSED BY '\"'
55+
ESCAPED BY '\"'
56+
LINES TERMINATED BY '\n'
57+
FROM `CommentUrl`;
58+
59+
# Writes every row of `TitleVersion` to '<PATH>TitleVersion.csv'.
# Title: each '\n' is replaced by a single space (not by '&#xD;&#xA;' as in the other
# exports), so the value contains no character equal to the row terminator.
# PredPostHistoryId, PredEditDistance, SuccPostHistoryId and SuccEditDistance are wrapped in
# IFNULL(..., '') so that a NULL value is exported as an empty field.
# NOTE(review): Title has no IFNULL wrapper and REPLACE returns NULL for a NULL input —
# presumably Title is NOT NULL; confirm against the table definition.
# Output format: utf8mb4, comma-separated, fields optionally enclosed in double quotes,
# double quote as the escape character, rows terminated by '\n'.
SELECT Id, PostId, PostTypeId, PostHistoryId, PostHistoryTypeId, CreationDate, REPLACE(Title, '\n', ' '), IFNULL(PredPostHistoryId, ''), IFNULL(PredEditDistance, ''), IFNULL(SuccPostHistoryId, ''), IFNULL(SuccEditDistance, ''), MostRecentVersion
60+
INTO OUTFILE '<PATH>TitleVersion.csv'
61+
CHARACTER SET utf8mb4
62+
FIELDS TERMINATED BY ','
63+
OPTIONALLY ENCLOSED BY '\"'
64+
ESCAPED BY '\"'
65+
LINES TERMINATED BY '\n'
66+
FROM `TitleVersion`;
67+
68+
# Writes every row of `StackSnippetVersion` to '<PATH>StackSnippetVersion.csv'.
# Content: each '\n' is replaced by the literal text '&#xD;&#xA;' before export, so the value
# contains no character equal to the row terminator ('\r' is not touched).
# NOTE(review): no column is wrapped in IFNULL(..., '') — presumably all of them are NOT NULL;
# confirm against the table definition.
# Output format: utf8mb4, comma-separated, fields optionally enclosed in double quotes,
# double quote as the escape character, rows terminated by '\n'.
SELECT Id, PostId, PostTypeId, PostHistoryId, REPLACE(Content, '\n', '&#xD;&#xA;')
69+
INTO OUTFILE '<PATH>StackSnippetVersion.csv'
70+
CHARACTER SET utf8mb4
71+
FIELDS TERMINATED BY ','
72+
OPTIONALLY ENCLOSED BY '\"'
73+
ESCAPED BY '\"'
74+
LINES TERMINATED BY '\n'
75+
FROM `StackSnippetVersion`;
76+
77+
# Writes every row of `PostViews` to '<PATH>PostViews.csv'.
# PostId, Version and ViewCount are written as-is (no IFNULL wrapper and no newline replacement).
# Output format: utf8mb4, comma-separated, fields optionally enclosed in double quotes,
# double quote as the escape character, rows terminated by '\n'.
SELECT PostId, Version, ViewCount
78+
INTO OUTFILE '<PATH>PostViews.csv'
79+
CHARACTER SET utf8mb4
80+
FIELDS TERMINATED BY ','
81+
OPTIONALLY ENCLOSED BY '\"'
82+
ESCAPED BY '\"'
83+
LINES TERMINATED BY '\n'
84+
FROM `PostViews`;

sotorrent/import/biguery/schema/PostVersion.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,5 +43,10 @@
4343
"mode": "REQUIRED",
4444
"name": "MostRecentVersion",
4545
"type": "BOOLEAN"
46+
},
47+
{
48+
"mode": "NULLABLE",
49+
"name": "Comment",
50+
"type": "STRING"
4651
}
4752
]

0 commit comments

Comments
 (0)