make_musan.py 6.65 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178


#!/usr/bin/env python3
# Copyright 2015   David Snyder
#           2019   Phani Sankar Nidadavolu
# Apache 2.0.
#
# This file is meant to be invoked by make_musan.sh.

import os, sys, argparse
sys.path.append("steps/data/")
sys.path.insert(0, 'steps/')
import libs.common as common_lib

def get_args():
    parser = argparse.ArgumentParser(description="Create MUSAN corpus",
                        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--use-vocals", type=str,
                        dest='use_vocals', default=True,
                        action=common_lib.StrToBoolAction,
                        choices=["true", "false"],
                        help='use vocals from the music corpus')
    parser.add_argument('--sampling-rate', type=int, default=16000,
                        help="Sampling rate of the source data. If a positive integer is specified with this option, "
                        "the MUSAN corpus will be resampled to the rate of the source data."
                        "Original MUSAN corpus is sampled at 16KHz. Defaults to 16000 Hz")
    parser.add_argument("in_dir", help="Input data directory")
    parser.add_argument("out_dir", help="Output data directory")

    print(' '.join(sys.argv))
    args = parser.parse_args()
    args = check_args(args)

    return args

def check_args(args):
    if not os.path.exists(args.in_dir):
        raise Exception('input dir {0} does not exist'.format(args.in_dir))
    if not os.path.exists(args.out_dir):
        print("Preparing {0}/musan...".format(args.out_dir))
        os.makedirs(args.out_dir)

    return args

def process_music_annotations(path):
    utt2spk = {}
    utt2vocals = {}
    lines = open(path, 'r').readlines()
    for line in lines:
        utt, genres, vocals, musician = line.rstrip().split()[:4]
        # For this application, the musican ID isn't important
        utt2spk[utt] = utt
        utt2vocals[utt] = vocals == "Y"
    return utt2spk, utt2vocals

def prepare_music(root_dir, use_vocals, sampling_rate):
    utt2vocals = {}
    utt2spk = {}
    utt2wav = {}
    num_good_files = 0
    num_bad_files = 0
    music_dir = os.path.join(root_dir, "music")
    for root, dirs, files in os.walk(music_dir):
        for file in files:
            file_path = os.path.join(root, file)
            if file.endswith(".wav"):
                utt = str(file).replace(".wav", "")
                utt2wav[utt] = file_path
            elif str(file) == "ANNOTATIONS":
                utt2spk_part, utt2vocals_part = process_music_annotations(file_path)
                utt2spk.update(utt2spk_part)
                utt2vocals.update(utt2vocals_part)

    utt2spk_str = ""
    utt2wav_str = ""
    for utt in utt2vocals:
        if utt in utt2wav:
            if use_vocals or not utt2vocals[utt]:
                utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
                if sampling_rate == 16000:
                    utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
                else:
                    utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r" \
                                    " {fs} -t wav - |\n".format(fs=sampling_rate)
            num_good_files += 1
        else:
            print("Missing file {}".format(utt))
            num_bad_files += 1
    print("In music directory, processed {} files; {} had missing wav data".format(
                                                    num_good_files, num_bad_files))
    return utt2spk_str, utt2wav_str


def prepare_speech(root_dir, sampling_rate):
    utt2spk = {}
    utt2wav = {}
    num_good_files = 0
    num_bad_files = 0
    speech_dir = os.path.join(root_dir, "speech")
    for root, dirs, files in os.walk(speech_dir):
        for file in files:
            file_path = os.path.join(root, file)
            if file.endswith(".wav"):
                utt = str(file).replace(".wav", "")
                utt2wav[utt] = file_path
                utt2spk[utt] = utt

    utt2spk_str = ""
    utt2wav_str = ""
    for utt in utt2spk:
        if utt in utt2wav:
            utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
            if sampling_rate == 16000:
                utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
            else:
                utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r" \
                                    " {fs} -t wav - |\n".format(fs=sampling_rate)
            num_good_files += 1
        else:
            print("Missing file {}".format(utt))
            num_bad_files += 1
    print("In speech directory, processed {} files; {} had missing wav data".format(
                                                    num_good_files, num_bad_files))
    return utt2spk_str, utt2wav_str


def prepare_noise(root_dir, sampling_rate):
    utt2spk = {}
    utt2wav = {}
    num_good_files = 0
    num_bad_files = 0
    noise_dir = os.path.join(root_dir, "noise")
    for root, dirs, files in os.walk(noise_dir):
        for file in files:
            file_path = os.path.join(root, file)
            if file.endswith(".wav"):
                utt = str(file).replace(".wav", "")
                utt2wav[utt] = file_path
                utt2spk[utt] = utt

    utt2spk_str = ""
    utt2wav_str = ""
    for utt in utt2spk:
        if utt in utt2wav:
            utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
            if sampling_rate == 16000:
                utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
            else:
                utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r" \
                                    " {fs} -t wav - |\n".format(fs=sampling_rate)
            num_good_files += 1
        else:
            print("Missing file {}".format(utt))
            num_bad_files += 1
    print("In noise directory, processed {} files; {} had missing wav data".format(
                                    num_good_files, num_bad_files))
    return utt2spk_str, utt2wav_str


def main():
    args = get_args()
    in_dir = args.in_dir
    out_dir = args.out_dir
    use_vocals = args.use_vocals
    sampling_rate = args.sampling_rate

    utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals, sampling_rate)
    utt2spk_speech, utt2wav_speech = prepare_speech(in_dir, sampling_rate)
    utt2spk_noise, utt2wav_noise = prepare_noise(in_dir, sampling_rate)

    utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise
    utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise
    wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w')
    wav_fi.write(utt2wav)
    utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w')
    utt2spk_fi.write(utt2spk)


if __name__=="__main__":
    main()