Skip to content

Commit c23a0f6

Browse files
committed
minor performance optimizations
1 parent f7f5b95 commit c23a0f6

File tree

2 files changed

+29
-18
lines changed

2 files changed

+29
-18
lines changed

docs/DataFormat.md

+6
Original file line numberDiff line numberDiff line change
@@ -127,3 +127,9 @@ which looks to be outdated
127127
package.json.{1..127}.tch
128128
setup.py.{1..127}.tch
129129

130+
## Update queue
131+
132+
Logs of processed projects are available in
133+
`da0_data/gitnub/list*` and `da0_data/gitnub/new*`.
134+
By checking dates on the log files, you can find
135+
when a project was updated.

oscar.py

+23-18
Original file line numberDiff line numberDiff line change
@@ -471,10 +471,8 @@ def __iter__(self):
471471
... for line in Tree("954829887af5d9071aa92c427133ca2cdd0813cc"))
472472
True
473473
"""
474-
try:
475-
data = self.data
476-
except ObjectNotFound:
477-
data = ''
474+
data = self.data
475+
478476
i = 0
479477
while i < len(data):
480478
# mode
@@ -592,7 +590,7 @@ def blobs(self):
592590
(<Blob: 2bdf5d686c6cd488b706be5c99c3bb1e166cf2f6>, ...,
593591
<Blob: c006bef767d08b41633b380058a171b7786b71ab>)
594592
"""
595-
return (Blob(sha) for sha in self.files.values())
593+
return (Blob(sha) for sha in self.blob_shas)
596594

597595

598596
class Commit(GitObject):
@@ -759,7 +757,8 @@ def children(self):
759757
def blob_shas(self):
760758
""" SHA hashes of all blobs in the commit
761759
762-
>>> Commit('af0048f4aac8f4760bf9b816e01524d7fb20a3fc').blob_shas # doctest: +NORMALIZE_WHITESPACE
760+
>>> Commit('af0048f4aac8f4760bf9b816e01524d7fb20a3fc').blob_shas
761+
... # doctest: +NORMALIZE_WHITESPACE
763762
('b2f49ffef1c8d7ce83a004b34035f917713e2766',
764763
'c92011c5ccc32a9248bd929a6e56f846ac5b8072',
765764
'bf3c2d2df2ef710f995b590ac3e2c851b592c871')
@@ -786,7 +785,8 @@ def blob_shas_rel(self):
786785
def blobs(self):
787786
""" A generator of `Blob` objects included in this commit
788787
789-
>>> tuple(Commit('af0048f4aac8f4760bf9b816e01524d7fb20a3fc').blobs) # doctest: +NORMALIZE_WHITESPACE
788+
>>> tuple(Commit('af0048f4aac8f4760bf9b816e01524d7fb20a3fc').blobs)
789+
... # doctest: +NORMALIZE_WHITESPACE
790790
(<Blob: b2f49ffef1c8d7ce83a004b34035f917713e2766>,
791791
<Blob: c92011c5ccc32a9248bd929a6e56f846ac5b8072>,
792792
<Blob: bf3c2d2df2ef710f995b590ac3e2c851b592c871>)
@@ -840,8 +840,8 @@ def __iter__(self):
840840
True
841841
"""
842842
for sha in self.commit_shas:
843-
c = Commit(sha)
844843
try:
844+
c = Commit(sha)
845845
author = c.author
846846
except ObjectNotFound:
847847
continue
@@ -880,7 +880,8 @@ def all(cls, name_prefix=''):
880880
def commit_shas(self):
881881
""" SHA1 of all commits in the project
882882
883-
>>> Project('user2589_django-currencies').commit_shas # doctest: +NORMALIZE_WHITESPACE
883+
>>> Project('user2589_django-currencies').commit_shas
884+
... # doctest: +NORMALIZE_WHITESPACE
884885
('2dbcd43f077f2b5511cc107d63a0b9539a6aa2a7',
885886
'7572fc070c44f85e2a540f9a5a05a95d1dd2662d')
886887
"""
@@ -893,7 +894,8 @@ def commits(self):
893894
It has the same effect as iterating a `Project` instance itself,
894895
with some additional validation of commit dates.
895896
896-
>>> tuple(Project('user2589_django-currencies').commits) # doctest: +NORMALIZE_WHITESPACE
897+
>>> tuple(Project('user2589_django-currencies').commits)
898+
... # doctest: +NORMALIZE_WHITESPACE
897899
(<Commit: 2dbcd43f077f2b5511cc107d63a0b9539a6aa2a7>,
898900
<Commit: 7572fc070c44f85e2a540f9a5a05a95d1dd2662d>)
899901
"""
@@ -975,19 +977,22 @@ def commits_fp(self):
975977
# simplified version (argmax): ~153 seconds
976978
# self.head(): ~190 seconds
977979

978-
# Sometimes (very rarely) commit dates are wrong, so the latest commit
979-
# is not actually the head. The magic below is to account for this
980+
# at this point we know all commits are in the dataset
981+
# (validated in __iter__)
980982
commits = {c.sha: c for c in self.commits}
981983
commit = max(commits.values(), key=lambda c: c.authored_at or DAY_Z)
982984
while commit:
985+
try: # here there is no guarantee commit is in the dataset
986+
first_parent = commit.parent_shas and commit.parent_shas[0]
987+
except ObjectNotFound:
988+
break
989+
983990
yield commit
984-
if not commit.parent_shas:
991+
992+
if not first_parent:
985993
break
986-
if commit.parent_shas[0] in commits:
987-
# save a bit of time on instantiation
988-
commit = commits[commit.parent_shas[0]]
989-
else:
990-
commit = Commit(commit.parent_shas[0])
994+
995+
commit = commits.get(first_parent, Commit(first_parent))
991996

992997

993998
class File(_Base):

0 commit comments

Comments
 (0)