Skip to content

Commit e95ff50

Browse files
committed
Update changelog and dates
1 parent 2fde7c9 commit e95ff50

File tree

3 files changed

+28
-23
lines changed

3 files changed

+28
-23
lines changed

CHANGELOG.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,12 @@ All notable changes to the SOTorrent dataset project will be documented in this
33

44
---
55

6-
## [2018-08-28] - Release for MSR Mining Challenge 2019, based on SO data dump 2018-06-05
6+
## [2018-09-23] - Second release for MSR Mining Challenge 2019, based on SO data dump 2018-09-05
7+
8+
* Update to Stack Overflow data dump 2018-09-05
9+
* Update `PostReferenceGH` (retrieved on 2018-09-23)
10+
11+
## [2018-08-28] - First release for MSR Mining Challenge 2019, based on SO data dump 2018-06-05
712

813
* Improve URL extraction (e.g., exclude matches in Markdown inline code, exclude invalid links)
914

bigquery/1_extract_so_references.sql

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
--- Status: 2018-08-28
1+
--- Status: 2018-09-23
22
--- Execute this in BigQuery
33

44
--- select all source code lines of text files that contain a link to Stack Overflow
@@ -31,7 +31,7 @@ FROM (
3131
)
3232
WHERE REGEXP_CONTAINS(line, r'(?i:https?://stackoverflow\.com/[^\s)\.\"]*)');
3333

34-
=> gh_so_references_2018_08_28.matched_lines
34+
=> gh_so_references_2018_09_23.matched_lines
3535

3636

3737
--- join with table "files" to get information about repos
@@ -44,11 +44,11 @@ SELECT
4444
size,
4545
url,
4646
line
47-
FROM `sotorrent-org.gh_so_references_2018_08_28.matched_lines` as lines
47+
FROM `sotorrent-org.gh_so_references_2018_09_23.matched_lines` as lines
4848
LEFT JOIN `bigquery-public-data.github_repos.files` as files
4949
ON lines.file_id = files.id;
5050

51-
=> gh_so_references_2018_08_28.matched_files
51+
=> gh_so_references_2018_09_23.matched_files
5252

5353

5454
--- normalize the SO links to (http://stackoverflow.com/(a/q)/<id>)
@@ -72,9 +72,9 @@ SELECT
7272
ELSE url
7373
END as url,
7474
line
75-
FROM `sotorrent-org.gh_so_references_2018_08_28.matched_files`;
75+
FROM `sotorrent-org.gh_so_references_2018_09_23.matched_files`;
7676

77-
=> gh_so_references_2018_08_28.matched_files_normalized
77+
=> gh_so_references_2018_09_23.matched_files_normalized
7878

7979

8080
--- extract post id from links, set post type id, and extract file extension from path
@@ -96,19 +96,19 @@ SELECT
9696
END as post_type_id,
9797
url,
9898
line
99-
FROM `sotorrent-org.gh_so_references_2018_08_28.matched_files_normalized`
99+
FROM `sotorrent-org.gh_so_references_2018_09_23.matched_files_normalized`
100100
WHERE
101101
REGEXP_CONTAINS(url, r'(http:\/\/stackoverflow\.com\/(?:a|q)\/[\d]+)');
102102

103-
=> gh_so_references_2018_08_28.matched_files_aq
103+
=> gh_so_references_2018_09_23.matched_files_aq
104104

105105

106106
--- use upper camel case (PascalCase) for column names, add number of copies, and remove line content for export to MySQL database
107107
#standardSQL
108108
WITH
109109
copies AS (
110110
SELECT file_id, count(*) as copies
111-
FROM `sotorrent-org.gh_so_references_2018_08_28.matched_files_aq`
111+
FROM `sotorrent-org.gh_so_references_2018_09_23.matched_files_aq`
112112
GROUP BY file_id
113113
)
114114
SELECT
@@ -123,16 +123,16 @@ SELECT
123123
post_type_id as PostTypeId,
124124
url as SOUrl,
125125
CONCAT('https://raw.githubusercontent.com/', repo_name, "/", branch, "/", path) as GHUrl
126-
FROM `sotorrent-org.gh_so_references_2018_08_28.matched_files_aq` files
126+
FROM `sotorrent-org.gh_so_references_2018_09_23.matched_files_aq` files
127127
JOIN copies
128128
ON files.file_id = copies.file_id;
129129

130-
=> gh_so_references_2018_08_28.PostReferenceGH
130+
=> gh_so_references_2018_09_23.PostReferenceGH
131131

132132

133133

134134
###################################################################
135-
# the following tables are not present in gh_so_references_2018_08_28
135+
# the following tables are not present in gh_so_references_2018_09_23
136136
# they will only be created on demand
137137
###################################################################
138138

@@ -155,7 +155,7 @@ WITH
155155
parent_id as ParentId,
156156
SOUrl,
157157
GHUrl
158-
FROM `sotorrent-org.gh_so_references_2018_08_28.PostReferenceGH` ref
158+
FROM `sotorrent-org.gh_so_references_2018_09_23.PostReferenceGH` ref
159159
LEFT JOIN `bigquery-public-data.stackoverflow.posts_answers` a
160160
ON ref.PostId = a.id
161161
WHERE PostTypeId=2
@@ -180,7 +180,7 @@ FROM answers
180180
LEFT JOIN `bigquery-public-data.stackoverflow.posts_questions` q
181181
ON answers.ParentId = q.id;
182182

183-
=> gh_so_references_2018_08_28.PostReferenceGH_Answers
183+
=> gh_so_references_2018_09_23.PostReferenceGH_Answers
184184

185185

186186
#standardSQL
@@ -192,9 +192,9 @@ SELECT
192192
CommentCount,
193193
Score,
194194
ParentViewCount
195-
FROM `sotorrent-org.gh_so_references_2018_08_28.PostReferenceGH_Answers`;
195+
FROM `sotorrent-org.gh_so_references_2018_09_23.PostReferenceGH_Answers`;
196196

197-
=> gh_so_references_2018_08_28.PostReferenceGH_Answers_R
197+
=> gh_so_references_2018_09_23.PostReferenceGH_Answers_R
198198

199199

200200
--- retrieve info about referenced SO questions
@@ -214,12 +214,12 @@ SELECT
214214
view_count as ViewCount,
215215
SOUrl,
216216
GHUrl
217-
FROM `sotorrent-org.gh_so_references_2018_08_28.PostReferenceGH` ref
217+
FROM `sotorrent-org.gh_so_references_2018_09_23.PostReferenceGH` ref
218218
LEFT JOIN `bigquery-public-data.stackoverflow.posts_questions` q
219219
ON ref.PostId = q.id
220220
WHERE PostTypeId=1;
221221

222-
=> gh_so_references_2018_08_28.PostReferenceGH_Questions
222+
=> gh_so_references_2018_09_23.PostReferenceGH_Questions
223223

224224

225225
#standardSQL
@@ -231,6 +231,6 @@ SELECT
231231
CommentCount,
232232
Score,
233233
ViewCount
234-
FROM `sotorrent-org.gh_so_references_2018_08_28.PostReferenceGH_Questions`;
234+
FROM `sotorrent-org.gh_so_references_2018_09_23.PostReferenceGH_Questions`;
235235

236-
=> gh_so_references_2018_08_28.PostReferenceGH_Questions_R
236+
=> gh_so_references_2018_09_23.PostReferenceGH_Questions_R

sotorrent/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,6 @@
1212

1313
## Data
1414

15-
The Stack Overflow data has been extracted from the official [Stack Exchange data dump](https://archive.org/details/stackexchange) released 2018-06-05.
15+
The Stack Overflow data has been extracted from the official [Stack Exchange data dump](https://archive.org/details/stackexchange) released 2018-09-05.
1616

17-
The GitHub references have been retrieved from the [Google BigQuery GitHub data set](https://cloud.google.com/bigquery/public-data/github) on 2018-08-28.
17+
The GitHub references have been retrieved from the [Google BigQuery GitHub data set](https://cloud.google.com/bigquery/public-data/github) on 2018-09-23.

0 commit comments

Comments
 (0)