PR: Org Repos #131 #133

Merged
merged 22 commits on Apr 23, 2024
Changes from all commits
22 commits
132a46e
borrow baseline AWS Lambda function handler code from https://github.…
nelsonic Feb 4, 2024
ba4c009
add config to deploy lambda function #128
nelsonic Feb 4, 2024
5e6c3f1
debug(context) #128
nelsonic Feb 4, 2024
3193c4d
add dpl for lambda deployment #128
nelsonic Feb 4, 2024
2d65911
add "lambda/index.js" to files_to_deploy #128
nelsonic Feb 4, 2024
279be1b
move index.js (lambda function) to root of project #128
nelsonic Feb 4, 2024
99697b3
add env2 #128
nelsonic Feb 4, 2024
cf2e7f3
remind myself how to write a lambda function ... #128
nelsonic Feb 4, 2024
adc478d
load environment variables to give S3 debugging #128
nelsonic Feb 4, 2024
7812af4
attempt to debug event ... #128
nelsonic Feb 4, 2024
72df2ca
push new version to test debugging ... #128
nelsonic Feb 4, 2024
cef3903
run gs in lambda function #128
nelsonic Mar 30, 2024
750854c
install github-scraper as dependency #128
nelsonic Mar 30, 2024
86d362e
require github-scraper in lambda function #128
nelsonic Mar 30, 2024
3e818c3
return scraped data instead of event in lambda #128
nelsonic Mar 30, 2024
f28d061
log rawPath #128
nelsonic Mar 30, 2024
b8ae814
use event.rawPath as github-scraper url #128
nelsonic Mar 30, 2024
c2bb505
create lib/next_page_beta.js for detecting next page on the Org Beta …
nelsonic Apr 23, 2024
7587bec
create lib/org_repos.js for #131 (basic working!)
nelsonic Apr 23, 2024
b79a4e9
Org Repos (Beta) parser working #131
nelsonic Apr 23, 2024
e612c1c
disable contributions as client side rendered #132
nelsonic Apr 23, 2024
7fd29ad
/* istanbul ignore else */ for next_page_beta #131
nelsonic Apr 23, 2024
3 changes: 3 additions & 0 deletions .gitignore
@@ -29,3 +29,6 @@ node_modules
.vagrant
crawl.js
.DS_Store

.env
tmp/
18 changes: 18 additions & 0 deletions index.js
@@ -0,0 +1,18 @@
require("env2")(".env");
const debug = require("./lambda/debug.js");
const gs = require('github-scraper');

exports.handler = function handler (event, context, callback) {
console.log(event);
console.log("Hi Friends!")
debug(event);
console.log('rawPath:', event.rawPath)

const url = event.rawPath;
gs(url, function(err, data) {
console.log(' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ')
console.log(data);

return callback(null, data);
});
}
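
For context, the handler above can be exercised locally with a hand-rolled event. A minimal sketch (not part of the PR): the file name test-handler.js and the rawPath value are made up, and it assumes a .env file is present so that require("env2")(".env") can load it.

// test-handler.js (hypothetical): invoke the Lambda handler with a fake Function URL event
const { handler } = require('./index.js');

// only `rawPath` is read by the handler; everything else is omitted
const event = { rawPath: '/dwyl' };

handler(event, {}, function (err, data) {
  if (err) { return console.error(err); }
  console.log(data); // the scraped data for github.com/dwyl
});
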
24 changes: 24 additions & 0 deletions lambda/debug.js
@@ -0,0 +1,24 @@
'use strict';
require('env2')('.env');
const save = require('./s3.js').save;

/**
* `debug` is used to debug SNS notification events.
* It only gets executed if the NODE_ENV is set to "test".
* To save event data to S3 you will need to add AWS_S3_BUCKET to .env
* see: github.com/dwyl/aws-ses-lambda/issues/12
* @param {Object} event - the object we want to store on S3
*/
module.exports = function debug (event) {
// console.log("process.env.NODE_ENV:", process.env.NODE_ENV);
if (process.env.NODE_ENV === "test") {
if(event.Records && !event.key) {
event.key = "sns";
}
save(event, function callback (error, data) {
console.log("DEBUG - - - error:", error, " - - - data:");
console.log(data);
console.log(" - - - - - - - - - - - - - - - - - - - - ");
});
}
};
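
A quick usage sketch (not in the PR), assuming NODE_ENV=test and AWS_S3_BUCKET are defined in .env; with those set, the event below is uploaded via lambda/s3.js:

// hypothetical SNS-style event: because it has Records and no key,
// debug sets event.key = "sns" and it lands in the bucket as "sns.json"
const debug = require('./lambda/debug.js');
debug({ Records: [ { Sns: { Message: 'hello' } } ] });
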
39 changes: 39 additions & 0 deletions lambda/http_request.js
@@ -0,0 +1,39 @@
'use strict';

require("env2")(".env"); // ensure JWT_SECRET environment variable is defined.
const http = require('https'); // ALWAYS use TLS over the internets!
const jwt = require('jsonwebtoken');
/**
* simple_http_request is a bare-bones http request using node.js core http
* see: https://nodejs.org/api/http.html#http_http_request_options_callback
* @param {Object} json - the JSON data we want to send to the Phoenix App.
* @param {Function} callback - a standard callback with error & response args
* response is a JSON Object unless there is an error. No error handling yet ...
*/

module.exports = function simple_http_request (json, callback) {
const options = { // the json data is included in the token! 😮
headers: {
'Authorization': jwt.sign(json, process.env.JWT_SECRET),
'Accept': 'application/json'
},
hostname: process.env.EMAIL_APP_URL, // e.g: phemail.herokuapp.com
method: 'POST', // HTTP post sans body: stackoverflow.com/questions/4191593
port: '443',
path: '/api/sns' // the API endpoint that processes and stores SNS data
}

http.request(options, function (res) {
let resStr = '';
res.setEncoding('utf8');
res.on('data', function (chunk) {
resStr += chunk;
}).on('end', function () {
return callback(res.statusCode, JSON.parse(resStr));
});
})
// .on('error', (e) => {
// console.error(`problem with request: ${e.message}`);
// })
.end();
};
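
A usage sketch for the helper above (hypothetical payload; assumes JWT_SECRET and EMAIL_APP_URL are set in .env and the receiving Phoenix app exposes POST /api/sns):

const http_request = require('./lambda/http_request.js');

// the JSON payload is signed into the Authorization header as a JWT, not sent as a body
http_request({ hello: 'world' }, function (statusCode, response) {
  console.log('status:', statusCode);  // e.g. 200
  console.log('response:', response);  // parsed JSON returned by the Phoenix app
});
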
52 changes: 52 additions & 0 deletions lambda/s3.js
@@ -0,0 +1,52 @@
'use strict';
require('env2')('.env');
const AWS = require('aws-sdk');
AWS.config.region = 'eu-west-1';
var s3 = new AWS.S3({params: {Bucket: process.env.AWS_S3_BUCKET}});

/**
* `save` saves a JSON object to S3.
* if you need to specify the file name, use `json.key`
* @param {Object} json - the object we want to store on S3
* @param {Function} callback - called once the file has been uploaded
*/
module.exports.save = function save (json, callback) {
if (json) {
const filename = json.key || 'event'
const params = {
Key: filename + '.json',
Body: JSON.stringify(json),
ContentType: 'application/json',
ACL: 'public-read'
};

s3.upload(params, function (err, data) {
if (callback && typeof callback === "function") {
return callback(err, data);
}
else {
return data;
}
});

} else {
return callback('ERROR: please provide json data');
}
}

/**
* `get` retrieves and parses a JSON file from S3
* this function is only used to test the `save` method.
* @param {String} key - the filename of the object to get from S3
* @param {Function} callback - called once the file has been retrieved
*/
module.exports.get = function get (key, callback) {
s3.getObject({Key: key}, function (error, data) {
if (error) {
return callback(error);
}
else {
return callback(error, JSON.parse(data.Body.toString()));
}
});
};
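
For reference, a minimal sketch of how save and get pair up (hypothetical object; assumes AWS credentials and AWS_S3_BUCKET are available via .env):

const s3 = require('./lambda/s3.js');

s3.save({ key: 'example', hello: 'world' }, function (err, data) {
  if (err) { return console.error(err); }
  console.log('uploaded to:', data.Location); // URL of example.json in the bucket
  // read it back; `get` expects the full object key, including the .json extension
  s3.get('example.json', function (getErr, json) {
    console.log(getErr, json); // null { key: 'example', hello: 'world' }
  });
});
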
17 changes: 17 additions & 0 deletions lib/next_page_beta.js
@@ -0,0 +1,17 @@
/**
* next_page checks for pagination on a "beta" page ref #131
* @param {Object} $ - cheerio object with DOM of page to be scraped
* @param {Object} data - the data we have scraped from the page so far
* @return {Object} the data object with a next_page key & value
*/
module.exports = function next_page_beta ($, data) {
const next = $('.TablePaginationSteps').find('[class^="Pagination__Page-"]').last().attr('href');
data.next_page = '';
/* istanbul ignore else */
if (next) {
const url = data.url.split('?')[0];
data.next_page = url + '?type=all&' + 'page=' + next.replace('#', '');
}

return data;
}
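
To illustrate the contract, a sketch with made-up markup that matches the selectors above (not actual GitHub HTML):

const cheerio = require('cheerio');
const next_page_beta = require('./lib/next_page_beta.js');

// hypothetical pagination markup: the last page link's href carries the next page number
const html = '<nav class="TablePaginationSteps">'
  + '<a class="Pagination__Page-abc" href="#2">2</a></nav>';
const $ = cheerio.load(html);

let data = { url: 'https://github.com/orgs/dwyl/repositories?type=all' };
data = next_page_beta($, data);
console.log(data.next_page); // https://github.com/orgs/dwyl/repositories?type=all&page=2
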
2 changes: 1 addition & 1 deletion lib/org.js
@@ -38,7 +38,7 @@ function org($, url, callback) {
name: $(parent + ' a').first().text().trim(),
lang: $(parent + 'span[itemprop=programmingLanguage]').first().text().trim(),
url: $(parent + ' a').first().attr('href'),
description: $(parent + 'p.d-inline-block').first().text().trim(),
description: $(parent + 'p[itemprop=description]').first().text().trim(),
updated: $(parent + ' relative-time')[0].attribs.datetime
});
});
46 changes: 46 additions & 0 deletions lib/org_repos.js
@@ -0,0 +1,46 @@
/**
* `org_repos` parses a given GitHub organization repositories page.
* e.g: https://github.com/orgs/dwyl/repositories?type=all
* @param {object} $ - the cheerio DOM object.
* @param {string} url - the url of the page to be parsed.
* @param {function} callback - the callback we should call after scraping
* a callback passed into this method should accept two parameters:
* @param {object} error - an error object (set to null if no error occurred)
* @param {object} data - the complete organisation data
*/
function org_repos($, url, callback) {
var data = { url: url, type: 'org_repos' };
data.name = $('h1.lh-condensed').first().text().trim();
// data.description = $('h1.lh-condensed').parent().next().text().trim(); // yep ...¯\_(ツ)_/¯
data.description = $('.container-xl .color-fg-muted').first().text().trim()
// var people = $('.Counter').eq(1); // people is *second* in list of tabs!
// data.pcount = parseInt(people.first().text(), 10);
// data.pcount = isNaN(data.pcount) ? 0 : data.pcount
data.avatar = $('.avatar')[0].attribs.src;
var parts = data.avatar.split('/');
data.uid = parseInt(parts[parts.length-1].split('?')[0], 10);
// list of repos
var items = $('li.listviewitem');
// console.log('items.length', items.length);
data.entries = []; // avoid having circular reference objects! :-(
items.each( function (i) { // JS counters start at 0.
// console.log(i)
var parent = 'li:nth-child(' + (i+1) +') '; // CSS selectors start at 1.
console.log($(parent))
console.log($(parent + ' .markdown-title'))
data.entries.push({
// feel free to add more attributes to this! 🙏
name: $(parent + ' .markdown-title').text().trim(),
// lang: $(parent + ' .listview-item-main-content').find('[class^="Text-"]').text().trim(),
url: $(parent + ' a').first().attr('href'),
description: $(parent + ' .repos-list-description').first().text().trim(),
// updated: $(parent + ' relative-time')[0].attribs.datetime
});
});
// console.log(data)

data = require('./next_page_beta')($, data); // don't worry this gets cached ;-)
callback(null, data);
}

module.exports = org_repos
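
End to end, the new parser is reached through the usual github-scraper entry point. A rough sketch, assuming the full URL form from the docblock example above is passed in:

const gs = require('github-scraper');

gs('https://github.com/orgs/dwyl/repositories?type=all', function (err, data) {
  if (err) { return console.error(err); }
  console.log(data.type);           // 'org_repos'
  console.log(data.entries.length); // number of repos listed on this page
  console.log(data.next_page);      // url of the next page, or '' if there is none
});
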
10 changes: 5 additions & 5 deletions lib/profile.js
@@ -1,5 +1,5 @@

const selectors=require('../config/repos')
const selectors = require('../config/repos')
/**
* profile method scrapes a given GitHub user profile
* @param {string} username - a valid GitHub username
@@ -46,11 +46,11 @@ module.exports = function profile ($, url, callback) {
data.website = $('[data-test-selector=profile-website-url] > a').attr("href")
// data.joined = $('.join-date').attr('datetime'); // Joined GitHub

// Contributions to Open Source in the past 12 months
data.contribs = parseInt($('.js-yearly-contributions').text().trim()
.split(' contributions')[0].replace(',', ''), 10);
// Contributions to Open Source in the past 12 months #132
// data.contribs = parseInt($('.js-yearly-contributions h2').text().trim()
// .split(' contributions')[0].replace(',', ''), 10);
// Contribution Matrix
data = require('./profile_contribs.js')($, data);
// data = require('./profile_contribs.js')($, data);

// List of (Public) organizations from profile
// data-hovercard-type="organization"
1 change: 1 addition & 0 deletions lib/scrapers.js
@@ -7,6 +7,7 @@ module.exports = {
// labels : require('./labels'),
// milestones : require('./milestones'),
org: require('./org'),
org_repos: require('./org_repos'),
people: require('./people'),
profile: require('./profile'),
repo: require('./repo'),
4 changes: 4 additions & 0 deletions lib/switcher.js
@@ -55,6 +55,10 @@ module.exports = function switcher (url, callback) {
console.log('repos_user - - - - - - - - -')
scraper = 'repos_user';
}
// e.g: https://github.com/orgs/dwyl/repositories?type=all
else if(url.match(/org/) && url.match(/repositories/)) {
scraper = 'org_repos';
}
else if(url.match(/followers|following/)) {
scraper = 'followers'; // html/DOM is identical for these 2 pages!
}
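
In isolation, the new branch routes an org repositories URL like this (a standalone illustration of the regex check, not how switcher is normally called):

const url = 'https://github.com/orgs/dwyl/repositories?type=all';
if (url.match(/org/) && url.match(/repositories/)) {
  console.log('scraper:', 'org_repos'); // this URL is now handled by lib/org_repos.js
}
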
3 changes: 0 additions & 3 deletions lib/utils.js
@@ -19,9 +19,6 @@ function parse_int (str) {
, 10)
}

/**
* A library of utility functions for parsing web data.
*/
module.exports = {
parse_int: parse_int
}