
Commit
cleaned up code
kevalmorabia97 committed Mar 15, 2019
1 parent d120603 commit fb4b00f
Showing 14 changed files with 178 additions and 570 deletions.
3 changes: 0 additions & 3 deletions src/BurstySegmentExtractor.py → BurstySegmentExtractor.py
@@ -2,9 +2,6 @@
import json
from math import exp, sqrt, log10

from Segment import *
from TimeWindow import *


class BurstySegmentExtractor():
"""
53 changes: 29 additions & 24 deletions src/EventSegmentClusterer.py → EventSegmentClusterer.py
@@ -1,26 +1,5 @@
import networkx as nx

def get_seg_similarity(bursty_segment_weights, time_window):
"""
return a dict of similarity between segments where keys are index of segment in bursty_segments
"""
print('Computing similarity between bursty segments')
seg_sim = {}
bursty_segments = list(bursty_segment_weights.keys())
n = len(bursty_segments)
for i in range(n):
seg_sim[i] = {}
seg_sim[i][i] = 1
seg1_name = bursty_segments[i]
print(i+1,seg1_name,str(bursty_segment_weights[seg1_name])[:7])
for j in range(i+1,n):
seg2_name = bursty_segments[j]
if j not in seg_sim: seg_sim[j] = {}
sim = time_window.get_segment_similarity(seg1_name, seg2_name)
seg_sim[i][j] = sim
seg_sim[j][i] = sim

return seg_sim

def get_events(bursty_segment_weights, segment_newsworthiness, seg_sim, n_neighbors=4, max_cluster_segments=20, threshold=4):
"""
@@ -65,15 +44,41 @@ def get_events(bursty_segment_weights, segment_newsworthiness, seg_sim, n_neighb
threshold_worthiness = max_event_worthiness/threshold

return [c for c in clusters if c[1]>threshold_worthiness]



def get_k_neighbors(k, seg, seg_sim):
"""
return set of k nearest neighbors of 'seg'
"""
neighbor_list = []
sim_list = [] # sim_list[i] = similarity of seg with neighbors[i]
sim_list = [] # sim_list[i] = similarity of seg with neighbor[i]
for i in seg_sim:
if i == seg: continue
neighbor_list.append(i)
sim_list.append(seg_sim[seg][i])
return set([x for _,x in sorted(zip(sim_list,neighbor_list), reverse=True)][:k])
return set([x for _,x in sorted(zip(sim_list,neighbor_list), reverse=True)][:k])


def get_seg_similarity(bursty_segment_weights, time_window):
"""
return a dict of similarity between segments where keys are index of segment in bursty_segments
"""
print('Computing similarity between bursty segments')
seg_sim = {}
bursty_segments = list(bursty_segment_weights.keys())
n = len(bursty_segments)

for i in range(n):
seg_sim[i] = {}
seg_sim[i][i] = 1

for i in range(n):
seg1_name = bursty_segments[i]
print(i+1, seg1_name, str(bursty_segment_weights[seg1_name])[:7])
for j in range(i+1, n):
seg2_name = bursty_segments[j]
sim = time_window.get_segment_similarity(seg1_name, seg2_name)
seg_sim[i][j] = sim
seg_sim[j][i] = sim

return seg_sim
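
For reference, the similarity dict returned by get_seg_similarity is what get_k_neighbors and get_events consume downstream. A minimal driver sketch, assuming a populated TimeWindow plus bursty_segment_weights and segment_newsworthiness dicts computed elsewhere (the variable names are illustrative, not part of this commit):

# hypothetical driver code, not part of the diff above
seg_sim = get_seg_similarity(bursty_segment_weights, time_window)
events = get_events(bursty_segment_weights, segment_newsworthiness, seg_sim,
                    n_neighbors=4, max_cluster_segments=20, threshold=4)
for cluster, worthiness in events:  # assuming each entry is a (segments, worthiness) pair
    print(cluster, worthiness)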
27 changes: 27 additions & 0 deletions Segment.py
@@ -0,0 +1,27 @@
class Segment:
def __init__(self,segment):
"""
segment - string of words ex. 'steve jobs'
"""
self.segment = segment
        self.tweets = [] # list of tweet texts (str) containing this segment in the current time window

self.freq = 0 # tweet-freq i.e. number of tweets containing this segment
        self.user_set = set() # set of unique users that used this segment in the current time window
self.retweet_count = 0 # sum of retweet counts of all tweets containing this segment
self.followers_count = 0 # sum of followers count of all users using this segment
self.newsworthiness = 0 # measure of importance of segment calculated by Twevent's Q(s) values

def __str__(self):
return 'Segment:'+self.segment+', freq:'+str(self.freq)+', user_count:'+str(self.get_user_count())

def add_tweet(self, user_id, text, retweet_count, followers_count):
self.tweets.append(text)
self.user_set.add(user_id)
self.freq += 1
self.retweet_count += retweet_count
self.followers_count += followers_count

def get_user_count(self):
return len(self.user_set)
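
A quick usage sketch for the new Segment class (the tweet data below is made up for illustration):

seg = Segment('steve jobs')
seg.add_tweet(user_id=42, text='RIP steve jobs', retweet_count=3, followers_count=1200)
seg.add_tweet(user_id=7, text='steve jobs biography released', retweet_count=0, followers_count=300)
print(seg)                   # Segment:steve jobs, freq:2, user_count:2
print(seg.get_user_count())  # 2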

71 changes: 35 additions & 36 deletions src/TimeWindow.py → TimeWindow.py
@@ -1,6 +1,6 @@
from sklearn.feature_extraction.text import TfidfVectorizer

from Segment import *
from Segment import Segment


def tf_idf_sim(text1, text2):
@@ -11,6 +11,37 @@ def tf_idf_sim(text1, text2):
except:
return 0
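
The body of tf_idf_sim is collapsed in this diff. One plausible implementation of such a helper, using the TfidfVectorizer imported above, computes the cosine similarity of the two texts' tf-idf vectors; this is an illustrative sketch, not necessarily the repository's exact code:

from sklearn.feature_extraction.text import TfidfVectorizer

def tf_idf_sim_sketch(text1, text2):
    try:
        tfidf = TfidfVectorizer().fit_transform([text1, text2])
        return (tfidf * tfidf.T).toarray()[0, 1]  # rows are l2-normalised, so this dot product is the cosine similarity
    except ValueError:  # e.g. empty vocabulary (texts contain only stop words)
        return 0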


class SubWindow:
time_frame_counter = 0 # static var

def __init__(self,segments, tweet_count):
"""
segments is dict of Segment class objects indexed by segment name ex. 'selena gomez'
tweet_count is number of tweets in this subwindow from which the segments are extracted
"""
SubWindow.time_frame_counter += 1

self.time_frame = SubWindow.time_frame_counter # unique time frame to each sub window starting from 1
self.segments = segments
self.tweet_count = tweet_count

def __str__(self):
result = 'SubWindow #'+str(self.time_frame)+', No. of Tweets: '+str(self.tweet_count)
return result

def get_tweets_containing_segment(self,segment):
return self.segments[segment].tweets

def get_freq_of_segment(self, segment):
return self.segments[segment].freq

def get_user_count_for_segment(self, segment):
return self.segments[segment].get_user_count()

########## END OF CLASS SubWindow ##########


class TimeWindow:

def __init__(self, initial_subwindows):
@@ -49,8 +80,6 @@ def get_tweets_containing_segment(self, seg_name):

def advance_window(self, next_subwindow):
print('Advancing Time Window')
removed_subwindow = self.subwindows[0]

self.subwindows = self.subwindows[1:]
self.subwindows.append(next_subwindow)
self.start_frame += 1
@@ -67,8 +96,8 @@ def get_segment_similarity(self, s1_name, s2_name):
similarity = 0
for sw in self.subwindows:

s1 = sw.segments.get(s1_name,None)
s2 = sw.segments.get(s2_name,None)
s1 = sw.segments.get(s1_name, None)
s2 = sw.segments.get(s2_name, None)

if not s1 == None: s1_freq += s1.freq
if not s2 == None: s2_freq += s2.freq
@@ -80,34 +109,4 @@ def get_segment_similarity(self, s1_name, s2_name):
similarity = similarity/(s1_freq * s2_freq)
return similarity


########## END OF CLASS TimeWindow ##########

class SubWindow:
time_frame_counter = 0 # static var

def __init__(self,segments, tweet_count):
"""
segments is dict of Segment class objects indexed by segment name ex. 'selena gomez'
tweet_count is number of tweets in this subwindow from which the segments are extracted
"""
SubWindow.time_frame_counter += 1

self.time_frame = SubWindow.time_frame_counter # unique time frame to each sub window starting from 1
self.segments = segments
self.tweet_count = tweet_count

def __str__(self):
result = 'SubWindow #'+str(self.time_frame)+', No. of Tweets: '+str(self.tweet_count)
return result

def get_tweets_containing_segment(self,segment):
return self.segments[segment].tweets

def get_freq_of_segment(self, segment):
return self.segments[segment].freq

def get_user_count_for_segment(self, segment):
return self.segments[segment].get_user_count()

########## END OF CLASS SubWindow ##########
########## END OF CLASS TimeWindow ##########
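
A hedged sketch of how SubWindow and TimeWindow might be driven, assuming the per-sub-window segments dicts (segment name -> Segment) have already been built elsewhere; the variable names are illustrative:

# hypothetical driver code, not part of the diff above
sw1 = SubWindow(segments_hour1, tweet_count=5000)
sw2 = SubWindow(segments_hour2, tweet_count=4800)
tw = TimeWindow([sw1, sw2])                                       # initial_subwindows

sim = tw.get_segment_similarity('steve jobs', 'apple ceo')        # tf-idf based similarity across sub-windows
tw.advance_window(SubWindow(segments_hour3, tweet_count=5100))    # slide the window forward by one sub-window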
120 changes: 2 additions & 118 deletions src/TweetSegmenter.py → TweetSegmenter.py
@@ -104,127 +104,11 @@ def tweet_segmentation(self, json_tweet):

return segmentation

##################### END OF CLASS WikiTweetSegmenter #########################

class TweventTweetSegmenter:

def __init__(self, wiki_prob_file='../data/WikiQsEng_non_zero_processed.json', n_gram_prob_file='../data/seg_prob_2012_Oct_11-22.json'):
"""
wiki_prob_file contains the probability that a phrase is anchor text among pages that contain the phrase (NER: Named Entity Recognition)
        n_gram_prob_file: n-gram probabilities were obtained from the Google n-gram (2012) API and are cached in a dict on first lookup so later lookups are faster
"""
print('Initializing TweventTweetSegmenter')

f = open(wiki_prob_file,'r')
self.wiki_prob = json.load(f)
f.close()

f = open(n_gram_prob_file,'r')
self.n_gram_prob = json.load(f)
f.close()

print('TweventTweetSegmenter Ready\n')

def get_length_norm(self,segment):
l = len(segment)
if l == 1: return 1
else: return (l-1)/l

def get_scp_score(self, segment):
"""
        segment: a list of words
        return the SCP (cohesiveness) score of the segment
"""
if len(segment) == 0:
return
if len(segment) == 1:
return 2 * math.log(self.get_n_gram_probability(segment))
pr_s = self.get_n_gram_probability(segment) # joint prob of segment s
n = len(segment)
prob_sum = 0
for i in range(1, n):
pr1 = self.get_n_gram_probability(segment[0:i])
pr2 = self.get_n_gram_probability(segment[i:n])
prob_sum += pr1 * pr2

avg = prob_sum/(n-1)
scp = math.log((pr_s**2)/avg)
return scp

def get_stickiness(self, segment):
"""
given one segment
        return the stickiness score
"""
scp_score = self.get_scp_score(segment)
l_norm = self.get_length_norm(segment)
wiki_prob = self.get_wiki_prob(segment)
stickiness = l_norm * math.exp(wiki_prob) / (1 + math.exp(-scp_score))
# print('\n',segment)
# print('SCP:',1/ (1 + math.exp(-scp_score)))
# print('wiki:',math.exp(wiki_prob))
# print('stickiness:',stickiness)
return stickiness

def get_n_gram_probability(self, segment):
"""
given a list of words as segment,
return the prior probability of the segment
"""
phrase = ' '.join(segment)
if phrase in self.n_gram_prob: return self.n_gram_prob[phrase]
else: return 1/10**9

def get_wiki_prob(self,segment):
"""
        segment is a list of words, e.g. ['south','america']
        return its wiki probability if present in the dict, else 0
"""
segment = ' '.join(segment) # convert from list of words to string
try:
return self.wiki_prob[segment]
except:
return 0

def text_segmentation(self, tweet_text, max_segment_len=3, e = 5):
"""
        Using dynamic programming
        Break the sentence into segments such that the sum of the segments' stickiness scores is maximised
"""
words = tweet_text.split(' ')
n = len(words)

        S = [[] for i in range(0,n)] ## S[i] = top e possible segmentations of the first i+1 words
for i in range(0, n):
if i < max_segment_len:
S[i].append( ([words[0:i+1]], self.get_stickiness(words[0:i+1])) )

j = i
while j>=1 and i-j+1<=max_segment_len:
t2 = words[j:i+1]
for segment in S[j-1]:
new_seg = []
for s in segment[0]:
new_seg.append(s)

new_seg.append(t2)
                    S[i].append((new_seg, self.get_stickiness(t2) + segment[1])) # segment[1] is its stickiness

S[i] = sorted(S[i], key = lambda x: x[1], reverse=True)[0:e]
j -= 1
#print(S[i])
return S[n-1][0][0]

##################### END OF CLASS TweventTweetSegmenter ######################
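
Although this class is deleted by the commit, the stickiness formula it implemented (length norm * exp(wiki probability) / (1 + exp(-SCP score))) is the core of Twevent-style segmentation, so a small hand-worked illustration may help; all numbers below are assumed, not taken from real data:

import math

l_norm = (2 - 1) / 2         # length normalisation for an assumed 2-word segment
wiki_prob = 0.8              # assumed probability that the phrase appears as Wikipedia anchor text
scp_score = 1.2              # assumed SCP (cohesiveness) score
stickiness = l_norm * math.exp(wiki_prob) / (1 + math.exp(-scp_score))
print(round(stickiness, 3))  # ~0.855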

if __name__ == '__main__':
#segmenter = TweventTweetSegmenter()
segmenter = SEDTWik(wiki_titles_file = '../data/enwiki-titles-unstemmed.txt')
segmenter = SEDTWikSegmenter(wiki_titles_file='data/enwiki-titles-unstemmed.txt')

while True:
print('Enter Tweet to segment it ("x" to exit)...')
tweet_text = str(input())
if tweet_text =='x': break
print('Segmentation:',segmenter.text_segmentation(tweet_text),'\n')

print('\nEXITED')
print('Segmentation:', segmenter.text_segmentation(tweet_text),'\n')
15 changes: 6 additions & 9 deletions src/TwitterEventDetector.py → TwitterEventDetector.py
@@ -4,23 +4,19 @@
import os
import sys

from BurstySegmentExtractor import *
from EventSegmentClusterer import *
from Segment import *
from TimeWindow import *
from BurstySegmentExtractor import BurstySegmentExtractor
from Segment import Segment
from TimeWindow import SubWindow
from TweetSegmenter import SEDTWikSegmenter
from utils.pyTweetCleaner import *
from utils.pyTweetCleaner import TweetCleaner


class TwitterEventDetector():

def __init__(self, wiki_titles_file, seg_prob_file, wiki_Qs_file, remove_retweets=False, max_segment_length=4, hashtag_wt=3,
use_retweet_count=True, use_followers_count=True, default_seg_prob=0.000001, entities_only=False):

self.segmenter = SEDTWikSegmenter(wiki_titles_file, max_segment_length, hashtag_wt, entities_only)

self.remove_retweets = remove_retweets

self.bse = BurstySegmentExtractor(seg_prob_file, use_retweet_count, use_followers_count, default_seg_prob)

# prob that a segment is anchor text in all pages containing that segment
@@ -30,12 +26,13 @@ def __init__(self, wiki_titles_file, seg_prob_file, wiki_Qs_file, remove_retweet
def clean_tweets_in_directory(self, root_dir, target_dir):
"""
clean tweets in root_dir using pyTweetCleaner and save cleaned files in target_dir
        This needs to be done just once and then the cleaned tweets can be used afterwards
"""
print('Cleaning all tweets in given directory')
tc = TweetCleaner(True, self.remove_retweets)

if not os.path.isdir(target_dir): os.mkdir(target_dir)
for dir_path, sub_dir_list, file_list in os.walk(root_dir):
for dir_path, _, file_list in os.walk(root_dir):
            dir_path = dir_path.replace('\\','/') # normalise Windows-style paths to Unix-style so the same handling works on both
dir_name = dir_path.replace(root_dir,'')
print('Found directory: %s' % dir_name)
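
The rest of this file's diff is not expanded above. Based only on the constructor signature visible in this hunk, a hedged construction-and-cleaning sketch might look like the following; the file paths are placeholders taken from defaults seen elsewhere in the repository:

# hypothetical usage, paths are placeholders
ted = TwitterEventDetector(wiki_titles_file='data/enwiki-titles-unstemmed.txt',
                           seg_prob_file='data/seg_prob_2012_Oct_11-22.json',
                           wiki_Qs_file='data/WikiQsEng_non_zero_processed.json',
                           remove_retweets=False)
ted.clean_tweets_in_directory('data/raw_tweets/', 'data/cleaned_tweets/')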
