MIDI and Lyrics Data Preprocess

将netease music中含有时间轴的歌词全部转移到统一的文件夹下

由于并不是所有的歌词信息都含有时间轴,因此我们需要将含有时间轴的歌词先抽取出来。

基于如下观察,我们可以发现,含有时间轴信息的歌词文件的开头如下:

1
"[00:00.00] 作词 : Jeff Keith/Frank Hannon\n[00:00.00] 作曲 : Jeff Keith/Frank Hannon\n[00:00.00] If you can  imagine this,\n[00:19.83]The whole world sharing one big kiss\n[00:26.52]These are thoughts all through my brain,that\n[00:30.24]

而不含有时间轴的歌词文件为如下格式:

1
"作词 : Hank Williams\n作曲 : Hank Williams\nGoodbye J

当然,还有一些歌词文件为空:

1
''

所以我们可以将歌词内容是否以"[0"开头作为条件,判断该歌词文件是否含有时间轴信息。

过滤的python script如下:

get_lyrcis_by_id.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import os
import shutil
import json
from datetime import datetime
from tqdm import tqdm

# Work from the directory that contains this script so that the relative
# paths below resolve no matter where the script is launched from.
current_path = os.path.dirname(os.path.abspath(__file__))
os.chdir(current_path)
print(current_path)


# Source folder holding the lyric JSON files, and the folder that receives
# the subset whose lyrics carry a time axis.
source_dir = './lyrics'
destination_dir = './lyrics with time stamp'

# Create the destination directory if it does not exist yet
# (the folder name is fixed; no timestamp is appended to it).
os.makedirs(destination_dir, exist_ok=True)

def starts_with_0(filename):
    """Return True if the JSON file holds a lyric string beginning with '[0'.

    The file is expected to contain a single JSON string (the raw lyric
    text). Lyrics with a time axis begin with a tag such as ``[00:00.00]``,
    so the ``'[0'`` prefix is used as the marker.

    Args:
        filename: Path of the JSON file to inspect.

    Returns:
        True when the decoded JSON value is a string starting with ``'[0'``;
        False for empty strings, lyrics without a time axis, non-string JSON
        values, and files that are not valid UTF-8 JSON.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except (json.JSONDecodeError, UnicodeDecodeError):
        # An undecodable file cannot be a usable lyric file.
        return False
    # Require a real string: stringifying a JSON list such as [0, 1] would
    # otherwise produce '[0, 1]' and be mistaken for a timestamped lyric.
    return isinstance(data, str) and data.startswith('[0')

# Copy every lyric file that carries a time axis into the destination folder.
for filename in tqdm(os.listdir(source_dir)):
    if not filename.endswith('.json'):
        continue
    full_path = os.path.join(source_dir, filename)
    if starts_with_0(full_path):
        shutil.copy(full_path, destination_dir)

print("Files copy successfully.")

之后一共得到了19000多条带有时间轴的数据(一共40000多条数据,netease music上有歌词的就24000条,再滤掉没有时间轴的就只剩19000条了,太难啦,之后还得试试spotify,把spotify的脚本完善一下了跑起来)

将MIDI文件与lyrics文件做对齐

接下来使用pretty_midi库尝试将MIDI文件按照lyrics中的时间轴进行切片,从而得到对齐的MIDI音符序列与歌词文本序列对。

从lyrics的json文件中抽取歌词与对应的时间戳

此处我是将原本的json文件修改为如下的格式:

1
2
3
4
5
6
[
{(start_time1, end_time1) : "lyrics1"},
{(start_time2, end_time2) : "lyrics2"},
... ...
{(start_time9, end_time9) : "lyrics9"},
]

由于JSON对象的key只能是字符串(tuple虽然是hashable的,可以作为python中dict的key,但json.dump无法序列化以tuple为key的dict;而list是unhashable的,根本无法作为dict的key),因此将其改为"start_time--end_time"的字符串格式

1
2
3
4
5
6
[
{"start_time1--end_time1" : "lyrics1"},
{"start_time2--end_time2" : "lyrics2"},
... ...
{"start_time9--end_time9" : "lyrics9"},
]

以下是代码实现:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import re
import os
import json
from tqdm import tqdm


# Work from the directory that contains this script so that the relative
# './...' paths used further down resolve regardless of the launch directory.
current_path = os.path.dirname(os.path.abspath(__file__))
os.chdir(current_path)

# A song is kept only if it yields strictly more than this many lyric lines;
# the value is also embedded in the destination folder name.
min_length = 10

def time_to_seconds(time_str):
    """Convert a colon-separated time string to seconds.

    Accepts ``MM:SS`` and ``MM:SS.mmm`` as before, and additionally
    ``HH:MM:SS(.mmm)``; a comma is accepted as the decimal separator.

    Args:
        time_str: Time string such as ``"01:23.45"``.

    Returns:
        The time in seconds as a float.

    Raises:
        ValueError: If a field is not a valid number.
    """
    total = 0.0
    # Each field is worth 60x the one to its right, so fold left to right.
    for field in time_str.replace(',', '.').split(':'):
        total = total * 60 + float(field)
    return total

def parse_lyrics_seconds(lyrics_data, pop_num_check = 5, last_line_duration = 10):
    """Turn timestamped lyric text into a list of ``{"start--end": lyric}`` dicts.

    Each line of the form ``[MM:SS(.mmm)]lyric`` is paired with the timestamp
    of the next matching line, which becomes its end time. Lines are paired in
    file order (timestamps are not sorted), and lines that do not match the
    pattern are ignored.

    Args:
        lyrics_data: The raw lyric text, lines separated by ``'\\n'``.
        pop_num_check: Maximum number of leading credit lines (those
            containing "词" or "曲", e.g. "作词"/"作曲") to strip.
        last_line_duration: Seconds assigned to the final lyric line, which
            has no following timestamp to close it.

    Returns:
        A list of single-entry dicts whose key is ``"<start>--<end>"`` in
        seconds and whose value is the stripped lyric text. Entries with an
        empty lyric are dropped.
    """
    # Timestamp forms: MM:SS.mmm, MM:SS.mm, MM:SS,mm or MM:SS. The fractional
    # part is an optional non-capturing group; the rest of the line is the lyric.
    line_pattern = re.compile(r'\[(\d+:\d+(?:[.,]\d+)?)](.*)')

    # Collect (start_seconds, lyric) for every timestamped line.
    timed_lines = []
    for line in lyrics_data.split('\n'):
        match = line_pattern.match(line)
        if match:
            stamp = match.group(1).replace(',', '.')  # normalise ',' to '.'
            timed_lines.append((time_to_seconds(stamp), match.group(2).strip()))

    if not timed_lines:
        return []

    # Every line ends where the next timestamped line starts.
    lyrics_list = [
        {f"{start}--{end}": text}
        for (start, text), (end, _) in zip(timed_lines, timed_lines[1:])
    ]
    # The last line has no successor, so give it a fixed duration.
    last_start, last_text = timed_lines[-1]
    lyrics_list.append({f"{last_start}--{last_start + last_line_duration}": last_text})

    # Drop empty lyrics first: blank lines are only needed as end-time
    # delimiters above, and a blank first entry must not stop the credit
    # stripping below.
    lyrics_list = [entry for entry in lyrics_list if next(iter(entry.values())) != ""]

    # Strip up to pop_num_check leading credit lines (作词 / 作曲 ...).
    skip = 0
    for entry in lyrics_list[:max(pop_num_check, 0)]:
        text = next(iter(entry.values()))
        if "词" in text or "曲" in text:
            skip += 1
        else:
            break

    return lyrics_list[skip:]


# Define the source and destination directories.
source_dir = './lyrics with time stamp'
# NOTE: the spelling 'setences' is part of the on-disk folder name and is
# kept unchanged so existing data and downstream scripts still match.
destination_dir = f'./lyrics cut into list with time stamp more than {min_length} setences'

# Ensure destination directory exists
os.makedirs(destination_dir, exist_ok=True)

# Convert every lyric file into the "start--end" list format.
for filename in tqdm(os.listdir(source_dir)):
    if not filename.endswith('.json'):
        continue
    source_file_path = os.path.join(source_dir, filename)
    destination_file_path = os.path.join(destination_dir, filename)

    # Read the raw lyric string; the file is closed before processing starts.
    with open(source_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    processed_data = parse_lyrics_seconds(data)

    # Keep only songs with strictly more than min_length lyric lines.
    if len(processed_data) > min_length:
        with open(destination_file_path, 'w', encoding='utf-8') as file:
            json.dump(processed_data, file, ensure_ascii=False, indent=4)

print("All files processed and saved.")

根据lyrics中的时间信息对midi文件进行分片

设计数据结构如下图所示:

1
2
3
4
5
6
7
8
9
{
"lyrics": "Just beat it, beat it, beat it",
"notes": [
["Piano", 0, [pitch, velocity, start_time, duration], [pitch, velocity, start_time, duration],.., [p, v, s, d]],
["Drum", 1, [pitch, velocity, start_time, duration], [pitch, velocity, start_time, duration],.., [p, v, s, d]],
...
["Violin", 6, [pitch, velocity, start_time, duration], [pitch, velocity, start_time, duration],.., [p, v, s, d]]
]
}

每一个片段为一个dict,包含两组kv对:第一组是歌词,key为字符串“lyrics”,value即是歌词的字符串;第二组是notes,它是一个list,其中每一个元素也是一个list:第一个元素是乐器名,第二个元素是乐器名对应的编号(该编号由pretty_midi提取得到,即instrument的program number;样例中的数字是随便写的,不代表真实编号),从第三个元素开始每个元素都是一个list,对应一个note,其四元组分别是音高、响度、开始时间、持续时间。

之所以这样设计数据结构,是因为每一个片段的midi文件中,可能有不止一个乐器,而每个乐器对应一组音符,这里的想法是将这些信息全部抽取出来,之后每一个乐器作为输入数据中的一个channel输入给模型(具体设计暂定)

在代码实现中,将切好的midi片段也一并保存下来,即既保存片段的midi文件,亦保存其json文件。

对一段时间内的midi进行切片

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
def cut_one_piece_of_midi(original_midi : pretty_midi.PrettyMIDI, start_time, end_time, store_path=None):
    """Collect, per instrument, the notes that start inside [start_time, end_time).

    Args:
        original_midi: The full song as a ``pretty_midi.PrettyMIDI`` object.
        start_time: Start of the window in seconds (inclusive).
        end_time: End of the window in seconds (exclusive).
        store_path: Optional directory. When given, the clip is also written
            there as ``"<start_time>_<end_time>.mid"`` with note times shifted
            so that the window starts at 0.

    Returns:
        ``{'notes': [[instrument_name, program, [pitch, velocity, start,
        duration], ...], ...]}`` with every value converted to ``str`` and
        numbers rounded to one decimal, or ``None`` when no note starts inside
        the window. Note start times in the returned structure are the
        original (absolute) times; only the written MIDI clip is shifted.
    """
    import os  # local import: this listing does not show the file's import block

    save_clip = store_path is not None
    new_midi = pretty_midi.PrettyMIDI() if save_clip else None
    new_clip = {'notes': []}

    for instrument in original_midi.instruments:
        program_number = instrument.program
        # Entry layout: [instrument name, program number, note, note, ...]
        instru_notes_list = [
            pretty_midi.program_to_instrument_name(program_number),
            str(program_number),
        ]
        new_instrument = pretty_midi.Instrument(program=program_number, is_drum=instrument.is_drum)

        for note in instrument.notes:
            # Membership is decided by the note's start only; a note may
            # still ring past end_time.
            if not (start_time <= note.start < end_time):
                continue
            if save_clip:
                # Shift the note so the clip starts at time 0.
                new_instrument.notes.append(
                    pretty_midi.Note(velocity=note.velocity,
                                     pitch=note.pitch,
                                     start=note.start - start_time,
                                     end=note.end - start_time)
                )
            instru_notes_list.append([
                str(round(note.pitch, 1)),
                str(round(note.velocity, 1)),
                str(round(note.start, 1)),
                str(round(note.end - note.start, 1))
            ])

        # More than the two header fields means at least one note was found.
        if len(instru_notes_list) > 2:
            if save_clip:
                new_midi.instruments.append(new_instrument)
            new_clip['notes'].append(instru_notes_list)

    if not new_clip['notes']:
        return None

    if save_clip:
        # os.path.join instead of a hard-coded "\\" so this also works off Windows.
        clip_path = os.path.join(store_path, f"{start_time}_{end_time}.mid")
        new_midi.write(clip_path)
    return new_clip

此处没有使用pretty_midi中的adjust_times直接对原midi文件进行时间调整从而切片,而是以音符为单位,搜索所有开始时间落在这个时间段内的音符,并按照不同的乐器分别存储起来。至于为什么要用str将音符的音高、响度、开始时间、持续时间进行转换,这是为了最后的数据能够用json文件存储。

对整个midi文件进行切片

由于我们之前已经爬取好了带时间轴的歌词,并按照上述所说的json文件的格式进行组织,所以直接根据歌词中的时序信息,对相应的整个midi文件依次调用cut_one_piece_of_midi即可(当然,此处的实现方式可以优化,只需要遍历一遍整个midi文件即可完成对所有时间序列的切片,但是懒得重写代码了,暂时就这样吧)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
def cut_whole_midi(midi : pretty_midi.PrettyMIDI, lyrics : list, store_path=None):
    """Slice a whole MIDI file into one clip per lyric line.

    Args:
        midi: The full song as a ``pretty_midi.PrettyMIDI`` object.
        lyrics: List of single-entry dicts ``{"start--end": lyric_text}``.
        store_path: Passed through to ``cut_one_piece_of_midi``.

    Returns:
        A list of ``{'lyrics': text, 'notes': [...]}`` dicts, one for each
        lyric line whose time window produced a clip.
    """
    whole_midi_clips = []
    for entry in lyrics:
        start_time, end_time, text = extract_lyric(entry)
        piece = cut_one_piece_of_midi(midi, start_time, end_time, store_path)
        if piece is None:
            # Nothing was cut for this window, so the lyric line is skipped.
            continue
        whole_midi_clips.append({'lyrics': text, 'notes': piece['notes']})
    return whole_midi_clips

def extract_lyric(lyrics : dict):
    """Unpack one ``{"start--end": lyric_text}`` entry.

    Only the first key/value pair of the dict is used.

    Args:
        lyrics: Single-entry dict whose key is ``"<start>--<end>"`` in seconds.

    Returns:
        A tuple ``(start_time, end_time, lyric)`` with both times as floats.

    Raises:
        ValueError: If the dict is empty, or the key is not two
            ``"--"``-separated numbers.
    """
    if not lyrics:
        # Previously this fell through and returned None, which surfaced as an
        # opaque unpacking error in the caller.
        raise ValueError("lyric entry is empty; expected {'start--end': text}")
    times, lyric = next(iter(lyrics.items()))
    start_time, end_time = times.split("--")
    return float(start_time), float(end_time), lyric

之后还剩余一些文件遍历的代码,此处就不再贴出,与主题无关。

至此,完成了对midi与lyrics的对齐处理。以下是处理后得到的json文件的效果:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
[
{
"lyrics": "There is freedom within",
"notes": [
[
"Electric Guitar (jazz)",
"26",
[
"65",
"127",
"13.7",
"0.1"
],
[
"70",
"127",
"13.7",
"0.1"
],
[
"65",
"127",
"13.9",
"0.2"
],
[
"70",
"127",
"13.9",
"0.3"
],
[
"63",
"127",
"14.3",
"0.1"
],
[
"63",
"127",
"15.2",
"0.1"
],
[
"67",
"127",
"15.2",
"0.1"
],
[
"55",
"127",
"14.7",
"0.8"
],
[
"48",
"127",
"14.7",
"0.8"
],
[
"60",
"127",
"14.7",
"1.0"
],
[
"63",
"127",
"15.4",
"0.2"
],
[
"67",
"127",
"15.4",
"0.2"
],
[
"48",
"127",
"15.8",
"0.2"
],
[
"55",
"127",
"15.8",
"0.2"
],
[
"60",
"127",
"15.8",
"0.2"
],
[
"55",
"127",
"16.1",
"0.8"
],
[
"48",
"127",
"16.1",
"0.8"
],
[
"60",
"127",
"16.1",
"1.0"
]
],
[
"Acoustic Guitar (steel)",
"25",
[
"65",
"127",
"13.7",
"0.1"
],
[
"70",
"127",
"13.7",
"0.1"
],
[
"65",
"127",
"13.9",
"0.2"
],
[
"70",
"127",
"13.9",
"0.3"
],
[
"63",
"127",
"14.3",
"0.1"
],
[
"63",
"127",
"15.2",
"0.1"
],
[
"67",
"127",
"15.2",
"0.1"
],
[
"55",
"127",
"14.7",
"0.8"
],
[
"48",
"127",
"14.7",
"0.8"
],
[
"60",
"127",
"14.7",
"1.0"
],
[
"63",
"127",
"15.4",
"0.2"
],
[
"67",
"127",
"15.4",
"0.2"
],
[
"48",
"127",
"15.8",
"0.2"
],
[
"55",
"127",
"15.8",
"0.2"
],
[
"60",
"127",
"15.8",
"0.2"
],
[
"55",
"127",
"16.1",
"0.8"
],
[
"48",
"127",
"16.1",
"0.8"
],
[
"60",
"127",
"16.1",
"1.0"
]
],
[
"Fretless Bass",
"35",
[
"34",
"127",
"14.3",
"0.4"
],
[
"36",
"127",
"14.6",
"2.5"
]
],

MIDI and Lyrics Data Preprocess
http://example.com/2023/11/20/MIDI-and-Lyrics-Data-Preprocess/
Author
iMusic
Posted on
November 20, 2023
Licensed under