% CHiME3_simulate_data_patched_parallel.m 13.4 KB
function CHiME3_simulate_data_patched_parallel(official,nj,chime4_dir,chime3_dir)

% CHIME3_SIMULATE_DATA_PATCHED_PARALLEL Creates simulated data for the 3rd
% CHiME Challenge, processing the utterances of each set with parfor
%
% CHiME3_simulate_data_patched_parallel(official,nj,chime4_dir,chime3_dir)
%
% Input:
% official: boolean flag indicating whether to recreate the official
% Challenge data (true) or to create new (non-official) data (false)
% nj: number of workers requested from parpool when no pool is open yet
% chime4_dir: root folder; tools/utils and tools/simulation are added to
% the path and data/audio/16kHz/{isolated,embedded,backgrounds} and
% data/annotations are read below it
% chime3_dir: root folder whose tools/enhancement/ is added to the path
%
% Output (files, nothing is returned):
% - equal_filter.mat in the current folder (computed once, then reused)
% - official=true: mixtures in local/nn-gev/data/audio/16kHz/isolated/ and
%   the matching *.Noise.wav / *.Clean.wav in
%   local/nn-gev/data/audio/16kHz/isolated_ext/, both relative to the
%   current folder, in <set>_<environment>_simu/ subfolders
% - official=false: new *_simu_new.json annotations in the annotation
%   folder, mixtures in <set>_<environment>_simu_new/ below the isolated
%   folder of chime4_dir, and *.Noise.wav / *.Clean.wav in
%   <set>_<environment>_simu_new/ below isolated_ext
%
% If you use this software in a publication, please cite:
%
% Jon Barker, Ricard Marxer, Emmanuel Vincent, and Shinji Watanabe, The
% third 'CHiME' Speech Separation and Recognition Challenge: Dataset,
% task and baselines, submitted to IEEE 2015 Automatic Speech Recognition
% and Understanding Workshop (ASRU), 2015.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Copyright 2015 University of Sheffield (Jon Barker, Ricard Marxer)
%                Inria (Emmanuel Vincent)
%                Mitsubishi Electric Research Labs (Shinji Watanabe)
% This software is distributed under the terms of the GNU Public License
% version 3 (http://www.gnu.org/licenses/gpl.txt)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

utils_folder = sprintf('%s/tools/utils', chime4_dir);
enhancement_folder = sprintf('%s/tools/enhancement/', chime3_dir);
addpath(utils_folder,'-end');
addpath(enhancement_folder);
sim_folder = sprintf('%s/tools/simulation', chime4_dir);
addpath(sim_folder);
upath = sprintf('%s/data/audio/16kHz/isolated/', chime4_dir);
cpath = sprintf('%s/data/audio/16kHz/embedded/', chime4_dir);
bpath = sprintf('%s/data/audio/16kHz/backgrounds/', chime4_dir);
apath = sprintf('%s/data/annotations/', chime4_dir);
upath_ext = 'local/nn-gev/data/audio/16kHz/isolated_ext/';
upath_simu = 'local/nn-gev/data/audio/16kHz/isolated/';
nchan=6;

% Define hyper-parameters
pow_thresh=-20; % threshold in dB below which a microphone is considered to fail
wlen_sub=256; % STFT window length in samples
blen_sub=4000; % average block length in samples for speech subtraction (250 ms)
ntap_sub=12; % filter length in frames for speech subtraction (88 ms)
wlen_add=1024; % STFT window length in samples for speaker localization
del=-3; % minimum delay (0 for a causal filter)

%% Create simulated training dataset from original WSJ0 data %%
if exist('equal_filter.mat','file')
    % Load into a struct so that equal_filter is an explicit assignment
    % (required for it to be a well-defined broadcast variable in parfor)
    cached=load('equal_filter.mat');
    equal_filter=cached.equal_filter;
else
    % Compute average power spectrum of booth data
    nfram=0;
    bth_spec=zeros(wlen_sub/2+1,1);
    sets={'tr05' 'dt05'};
    for set_ind=1:length(sets)
        set_name=sets{set_ind};
        bth_mat=json2mat([apath set_name '_bth.json']);
        for utt_ind=1:length(bth_mat)
            oname=[bth_mat{utt_ind}.speaker '_' bth_mat{utt_ind}.wsj_name '_BTH'];
            fprintf('%s\n',[upath set_name '_bth/' oname '.CH0.wav']);
            o=audioread([upath set_name '_bth/' oname '.CH0.wav']);
            O=stft_multi(o.',wlen_sub);
            nfram=nfram+size(O,2);
            bth_spec=bth_spec+sum(abs(O).^2,2);
        end
    end
    bth_spec=bth_spec/nfram;

    % Compute average power spectrum of original WSJ0 data
    nfram=0;
    org_spec=zeros(wlen_sub/2+1,1);
    olist=dir([upath 'tr05_org/*.wav']);
    for f=1:length(olist)
        oname=olist(f).name;
        o=audioread([upath 'tr05_org/' oname]);
        O=stft_multi(o.',wlen_sub);
        nfram=nfram+size(O,2);
        org_spec=org_spec+sum(abs(O).^2,2);
    end
    org_spec=org_spec/nfram;

    % Derive equalization filter
    equal_filter=sqrt(bth_spec./org_spec);
    save('equal_filter.mat','equal_filter');
end

% Read official annotations
if official
    mat=json2mat([apath 'tr05_simu.json']);
% Create new (non-official) annotations
else
    mat=json2mat([apath 'tr05_org.json']);
    ir_mat=json2mat([apath 'tr05_real.json']);
    envirs={'BUS' 'CAF' 'PED' 'STR'};
    for utt_ind=1:length(mat)
        oname=[mat{utt_ind}.speaker '_' mat{utt_ind}.wsj_name '_ORG'];
        % audioread has no 'size' option (that was wavread); use audioinfo
        oinfo=audioinfo([upath 'tr05_org/' oname '.wav']);
        dur=oinfo.TotalSamples/16000;
        envir=envirs{randperm(4,1)}; % draw a random environment
        mat{utt_ind}.environment=envir;
        blist=dir([bpath '*' envir '.CH1.wav']);

        % Duration mismatch of every impulse response recording from the
        % drawn environment (inf for recordings from other environments)
        dur_diff=inf(1,length(ir_mat));
        for ir_ind=1:length(ir_mat)
            if strcmp(ir_mat{ir_ind}.environment,envir)
                ir_dur=ir_mat{ir_ind}.end-ir_mat{ir_ind}.start;
                dur_diff(ir_ind)=abs(ir_dur-dur);
            end
        end

        % Draw background noise until no microphone failure is detected
        nfail=true;
        while nfail
            bname=blist(randperm(length(blist),1)).name(1:end-8); % draw a random background recording
            mat{utt_ind}.noise_wavfile=bname;
            binfo=audioinfo([bpath bname '.CH1.wav']);
            bdur=binfo.TotalSamples/16000;
            mat{utt_ind}.noise_start=ceil(rand(1)*(bdur-dur)*16000)/16000; % draw a random time
            mat{utt_ind}.noise_end=mat{utt_ind}.noise_start+dur;
            nname=mat{utt_ind}.noise_wavfile;
            nbeg=round(mat{utt_ind}.noise_start*16000)+1;
            nend=round(mat{utt_ind}.noise_end*16000);
            n=zeros(nend-nbeg+1,nchan);
            for c=1:nchan
                n(:,c)=audioread([bpath nname '.CH' int2str(c) '.wav'],[nbeg nend]);
            end
            npow=sum(n.^2,1);
            npow=10*log10(npow/max(npow));
            nfail=any(npow<=pow_thresh); % check for microphone failure
        end

        % Try impulse responses by increasing duration mismatch until no
        % microphone failure is detected
        xfail=true;
        while xfail
            [~,ir_ind]=min(dur_diff); % pick impulse response from the same environment with the closest duration
            if isinf(dur_diff(ir_ind))
                % Every candidate was rejected (or none exists); stopping in
                % the debugger would hang an unattended run, so fail loudly
                error('CHiME3:simulate:noImpulseResponse', ...
                    'No usable impulse response recording left for environment %s (utterance %s).', ...
                    envir,oname);
            end
            mat{utt_ind}.ir_wavfile=ir_mat{ir_ind}.wavfile;
            mat{utt_ind}.ir_start=ir_mat{ir_ind}.start;
            mat{utt_ind}.ir_end=ir_mat{ir_ind}.end;
            iname=mat{utt_ind}.ir_wavfile;
            ibeg=round(mat{utt_ind}.ir_start*16000)+1;
            iend=round(mat{utt_ind}.ir_end*16000);
            x=zeros(iend-ibeg+1,nchan);
            for c=1:nchan
                x(:,c)=audioread([cpath iname '.CH' int2str(c) '.wav'],[ibeg iend]);
            end
            xpow=sum(x.^2,1);
            xpow=10*log10(xpow/max(xpow));
            xfail=any(xpow<=pow_thresh); % check for microphone failure
            dur_diff(ir_ind)=inf; % never pick this recording again for this utterance
        end
        mat{utt_ind}=orderfields(mat{utt_ind});
    end
    mat2json(mat,[apath 'tr05_simu_new.json']);
end

% Reuse an already open pool; otherwise start one and make sure it is shut
% down when this function exits, including when it exits through an error
pool=gcp('nocreate');
if isempty(pool)
    pool=parpool('local', nj);
    pool_cleanup=onCleanup(@() delete(pool)); %#ok<NASGU>
end

% Loop over utterances
parfor utt_ind=1:length(mat)
    utt=mat{utt_ind};
    [udir,udir_ext]=prepare_output_dirs(official,upath,upath_simu,upath_ext,'tr05',utt.environment);
    oname=[utt.speaker '_' utt.wsj_name '_ORG'];
    iname=utt.ir_wavfile;
    nname=utt.noise_wavfile;
    uname=[utt.speaker '_' utt.wsj_name '_' utt.environment];
    ibeg=round(utt.ir_start*16000)+1;
    iend=round(utt.ir_end*16000);
    nbeg=round(utt.noise_start*16000)+1;
    nend=round(utt.noise_end*16000);

    % Load WAV files
    fprintf('%s\n',[upath 'tr05_org/' oname '.wav']);
    o=audioread([upath 'tr05_org/' oname '.wav']);
    [r,fs]=audioread([cpath iname '.CH0.wav'],[ibeg iend]);
    fprintf('%s [%d %d]\n',[cpath iname '.CH0.wav'],ibeg,iend);
    x=zeros(iend-ibeg+1,nchan);
    n=zeros(nend-nbeg+1,nchan);
    for c=1:nchan
        fprintf('%s Place1\n',[cpath iname '.CH' int2str(c) '.wav']);
        x(:,c)=audioread([cpath iname '.CH' int2str(c) '.wav'],[ibeg iend]);
        n(:,c)=audioread([bpath nname '.CH' int2str(c) '.wav'],[nbeg nend]);
        fprintf('%s Place2\n',[bpath nname '.CH' int2str(c) '.wav']);
    end

    % Compute the STFT (short window)
    O=stft_multi(o.',wlen_sub);
    R=stft_multi(r.',wlen_sub);
    X=stft_multi(x.',wlen_sub);

    % Estimate 88 ms impulse responses on 250 ms time blocks
    A=estimate_ir(R,X,blen_sub,ntap_sub,del);

    % Derive SNR
    Y=apply_ir(A,R,del);
    y=istft_multi(Y,iend-ibeg+1).';
    SNR=sum(sum(y.^2))/sum(sum((x-y).^2));

    % Equalize microphone
    O=O.*repmat(equal_filter,[1 size(O,2)]);
    o=istft_multi(O,nend-nbeg+1).';

    % Compute the STFT (long window), then localize the speaker and filter
    % the clean speech accordingly
    O=stft_multi(o.',wlen_add);
    X=stft_multi(x.',wlen_add);
    ysimu=spatialize_speech(O,X,nchan,wlen_add,fs,nend-nbeg+1);

    % Normalize level and add
    ysimu=sqrt(SNR/sum(sum(ysimu.^2))*sum(sum(n.^2)))*ysimu;
    xsimu=ysimu+n;

    % Write WAV file
    write_simulated_channels(udir,udir_ext,uname,xsimu,n,ysimu,fs);
end

%% Create simulated development and test datasets from booth recordings %%
sets={'dt05' 'et05'};
for set_ind=1:length(sets)
    set_name=sets{set_ind};

    % Read official annotations
    if official
        mat=json2mat([apath set_name '_simu.json']);

    % Create new (non-official) annotations
    else
        mat=json2mat([apath set_name '_real.json']);
        clean_mat=json2mat([apath set_name '_bth.json']);
        for utt_ind=1:length(mat)
            for clean_ind=1:length(clean_mat) % match noisy utterance with same clean utterance (may be from a different speaker)
                if strcmp(clean_mat{clean_ind}.wsj_name,mat{utt_ind}.wsj_name)
                    break;
                end
            end
            noise_mat=mat{utt_ind};
            mat{utt_ind}=clean_mat{clean_ind};
            mat{utt_ind}.environment=noise_mat.environment;
            mat{utt_ind}.noise_wavfile=noise_mat.wavfile;
            dur=mat{utt_ind}.end-mat{utt_ind}.start;
            noise_dur=noise_mat.end-noise_mat.start;
            % Split the duration difference between both ends of the
            % noisy segment so that it matches the clean duration
            pbeg=round((dur-noise_dur)/2*16000)/16000;
            pend=round((dur-noise_dur)*16000)/16000-pbeg;
            mat{utt_ind}.noise_start=noise_mat.start-pbeg;
            mat{utt_ind}.noise_end=noise_mat.end+pend;
            mat{utt_ind}=orderfields(mat{utt_ind});
        end
        mat2json(mat,[apath set_name '_simu_new.json']);
    end

    % Loop over utterances
    parfor utt_ind=1:length(mat)
        utt=mat{utt_ind};
        [udir,udir_ext]=prepare_output_dirs(official,upath,upath_simu,upath_ext,set_name,utt.environment);
        oname=[utt.speaker '_' utt.wsj_name '_BTH'];
        nname=utt.noise_wavfile;
        uname=[utt.speaker '_' utt.wsj_name '_' utt.environment];
        tbeg=round(utt.noise_start*16000)+1;
        tend=round(utt.noise_end*16000);

        % Load WAV files
        o=audioread([upath set_name '_bth/' oname '.CH0.wav']);
        [r,fs]=audioread([cpath nname '.CH0.wav'],[tbeg tend]);
        nsampl=length(r);
        x=zeros(nsampl,nchan);
        for c=1:nchan
            x(:,c)=audioread([cpath nname '.CH' int2str(c) '.wav'],[tbeg tend]);
        end

        % Compute the STFT (short window)
        R=stft_multi(r.',wlen_sub);
        X=stft_multi(x.',wlen_sub);

        % Estimate 88 ms impulse responses on 250 ms time blocks
        A=estimate_ir(R,X,blen_sub,ntap_sub,del);

        % Filter and subtract close-mic speech
        Y=apply_ir(A,R,del);
        y=istft_multi(Y,nsampl).';
        level=sum(sum(y.^2));
        n=x-y;

        % Compute the STFT (long window), then localize the speaker and
        % filter the clean speech accordingly
        O=stft_multi(o.',wlen_add);
        X=stft_multi(x.',wlen_add);
        ysimu=spatialize_speech(O,X,nchan,wlen_add,fs,nsampl);

        % Normalize level and add
        ysimu=sqrt(level/sum(sum(ysimu.^2)))*ysimu;
        xsimu=ysimu+n;

        % Write WAV file
        write_simulated_channels(udir,udir_ext,uname,xsimu,n,ysimu,fs);
    end
end
end

function [udir,udir_ext]=prepare_output_dirs(official,upath,upath_simu,upath_ext,set_name,envir)
% Builds the mixture folder (udir) and the noise/clean folder (udir_ext)
% for one set and environment, and creates them when they do not exist.
% udir_ext is defined in both modes: the non-official mode previously left
% it unset although it is needed for the *.Noise.wav / *.Clean.wav files.
stem=[set_name '_' lower(envir)];
if official
    udir=[upath_simu stem '_simu/'];
    udir_ext=[upath_ext stem '_simu/'];
else
    udir=[upath stem '_simu_new/'];
    udir_ext=[upath_ext stem '_simu_new/'];
end
if ~exist(udir,'dir')
    system(['mkdir -p ' udir]);
end
if ~exist(udir_ext,'dir')
    system(['mkdir -p ' udir_ext]);
end
end

function ysimu=spatialize_speech(O,X,nchan,wlen_add,fs,nsampl)
% Turns the single-channel clean speech STFT O into an nchan-channel time
% signal of nsampl samples, using the per-channel delays that localize
% returns for the multichannel STFT X.
[nbin,nfram]=size(O);

% Localize and track the speaker
[~,TDOAx]=localize(X);

% Interpolate the spatial position over the duration of clean speech
TDOA=zeros(nchan,nfram);
for c=1:nchan
    TDOA(c,:)=interp1(0:size(X,2)-1,TDOAx(c,:),(0:nfram-1)/(nfram-1)*(size(X,2)-1));
end

% Filter clean speech
Ysimu=zeros(nbin,nfram,nchan);
for f=1:nbin
    for t=1:nfram
        Df=sqrt(1/nchan)*exp(-2*1i*pi*(f-1)/wlen_add*fs*TDOA(:,t));
        Ysimu(f,t,:)=permute(Df*O(f,t),[2 3 1]);
    end
end
ysimu=istft_multi(Ysimu,nsampl).';
end

function write_simulated_channels(udir,udir_ext,uname,xsimu,n,ysimu,fs)
% Writes one WAV file per channel (column): the mixture xsimu into udir,
% the noise n and the scaled clean speech ysimu into udir_ext.
for c=1:size(xsimu,2)
    audiowrite([udir uname '.CH' int2str(c) '.wav'],xsimu(:,c),fs);
    audiowrite([udir_ext uname '.CH' int2str(c) '.Noise.wav'],n(:,c),fs);
    audiowrite([udir_ext uname '.CH' int2str(c) '.Clean.wav'],ysimu(:,c),fs);
end
end