-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathextract_data.py
More file actions
145 lines (120 loc) · 4.48 KB
/
extract_data.py
File metadata and controls
145 lines (120 loc) · 4.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# -*- coding: utf-8 -*-
import os
import sys
import time
import glob
import codecs
import datetime
import argparse
import sqlite3
import numpy as np # get it at: http://numpy.scipy.org/
from csv import DictWriter
from shutil import copyfile
# imports specific to the MSD (hdf5_getters) are done inside main(), after its directory is added to sys.path
# we define this very useful function to iterate the files
def apply_to_all_files(basedir, func=lambda x: x, ext='.h5'):
    """Call ``func`` on every path ending in ``ext`` under ``basedir``.

    ``basedir`` and each of its subdirectories are visited with ``os.walk``;
    within each directory the candidates are whatever
    ``glob.glob('<dir>/*<ext>')`` returns.

    Args:
        basedir: Root directory of the tree to scan.
        func: Callable invoked with each matching path; its return value is
            ignored. Defaults to the identity function.
        ext: Suffix appended to ``'*'`` to build the glob pattern.

    Returns:
        The total number of matching paths found.
    """
    pattern = '*' + ext
    total = 0
    # Only the directory name from os.walk is used; the listing itself
    # comes from the glob so that the extension filter is applied.
    for dirpath, _subdirs, _names in os.walk(basedir):
        matches = glob.glob(os.path.join(dirpath, pattern))
        total += len(matches)
        for path in matches:
            func(path)
    return total
def _load_genres(tagtraum_path):
    """Parse a tab-separated annotation file into ``{first_field: [other fields]}``."""
    genres_by_track = {}
    with open(tagtraum_path) as handle:
        for line in handle:
            # Strip the line ending up front so the last genre carries no
            # trailing "\n" (or "\r\n" for files with Windows line endings).
            fields = line.rstrip("\r\n").split("\t")
            genres_by_track[fields[0]] = fields[1:]
    return genres_by_track


def _track_h5_path(data_dir, track_id):
    """Return ``<data_dir>/<c3>/<c4>/<c5>/<track_id>.h5`` (3rd-5th id characters name the sub-directories)."""
    parts = list(track_id[2:5]) + [track_id + ".h5"]
    return os.path.join(data_dir, *parts)


def _format_lyrics(rows):
    """Render each lyrics row as ``<column 2>|<column 3>`` followed by one space."""
    # NOTE(review): columns 2 and 3 are presumably the word and its count in
    # the musiXmatch ``lyrics`` table -- confirm against the database schema.
    return "".join("%s|%s " % (row[2], row[3]) for row in rows)


def main():
    """Build ``otherdata.csv`` with the genres and lyrics of the common songs.

    Command-line options (each falls back to a built-in default when omitted):
        --msd         root of the Million Song Subset (must be a directory)
        --msdtoolkit  root of the MSongsDB code (must be a directory)
        --tagtraum    tab-separated genre annotation file (must exist)
        --mxm         musiXmatch SQLite database (must exist)
        --cs          text file with one track id per line

    For every track id in the common-songs file whose ``.h5`` file exists
    under ``<msd>/data``, one line ``track_id,genre|genre...,lyrics`` is
    written to ``otherdata.csv`` in the current directory, after a
    ``track_id,genre,lyrics`` header line.

    Raises:
        AssertionError: if a required directory or file is missing.
        ImportError: if ``hdf5_getters`` cannot be imported from the toolkit.
        KeyError: if a selected track id has no entry in the genre file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--msd", dest="msd")
    parser.add_argument("--msdtoolkit", dest="msdt")
    parser.add_argument("--tagtraum", dest="tagtraum")
    parser.add_argument("--mxm", dest="mxm")
    parser.add_argument("--cs", dest="commonsongs")
    args = parser.parse_args()

    msd_subset_path = args.msd or '/home/rcrimi/corpusMusic/MillionSongSubset'
    msd_subset_data_path = os.path.join(msd_subset_path, 'data')
    assert os.path.isdir(msd_subset_path), 'wrong path'  # sanity check

    msd_code_path = args.msdt or '/home/rcrimi/Documents/MSongsDB'
    assert os.path.isdir(msd_code_path), 'wrong path'  # sanity check
    sys.path.append(os.path.join(msd_code_path, 'PythonSrc'))

    tagtraum_path = args.tagtraum or "msd_tagtraum_cd2.cls"
    assert os.path.isfile(tagtraum_path)

    mxm_path = args.mxm or '/home/rcrimi/corpusMusic/MillionSongSubset/mxm_dataset.db'
    assert os.path.isfile(mxm_path)

    commonsongs_path = args.commonsongs or "common_songs.txt"

    # Nothing below uses GETTERS; the import is kept so that a missing or
    # misconfigured MSD toolkit still fails here, as it always did.
    import hdf5_getters as GETTERS  # noqa: F401

    tagtraum_dict = _load_genres(tagtraum_path)
    full_data = [["track_id", "genre", "lyrics"]]

    # One connection for the whole run instead of one per track.
    conn = sqlite3.connect(mxm_path)
    try:
        with open(commonsongs_path) as f:
            for raw_id in f:
                track_id = raw_id.strip()
                if not track_id:
                    continue  # blank line
                # Look under the --msd directory rather than a fixed path,
                # so the option actually controls which tracks are found.
                if not os.path.isfile(_track_h5_path(msd_subset_data_path, track_id)):
                    continue
                genres = tagtraum_dict[track_id]
                # Placeholder binding keeps the id out of the SQL text.
                rows = conn.execute(
                    "SELECT * FROM lyrics WHERE track_id = ?", (track_id,)
                ).fetchall()
                full_data.append([track_id, "|".join(genres), _format_lyrics(rows)])
    finally:
        conn.close()

    # NOTE(review): fields are joined with plain commas and no quoting, and
    # the file uses the platform default encoding -- confirm that is what
    # downstream readers expect.
    with open('otherdata.csv', "w") as outfile:
        outfile.writelines(",".join(row) + "\n" for row in full_data)
# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()