function labels = SVD2(tokens, N_types, freqs, svd_R1, cl_num_1, svd_R2, cl_num_2)
% SVD2  Two-pass SVD + clustering algorithm.
%
% If you use this code, please cite:
%   Michael Lamar*, Yariv Maron*, Mark Johnson, Elie Bienenstock.
%   SVD and clustering for unsupervised POS tagging. (ACL 2010)
% For more information, see the above reference.
%
% Input:
%   tokens   - The corpus, as a sequence of word ids.
%   N_types  - Number of word types in the corpus. This is also the size
%              of unique(tokens).
%   freqs    - Frequencies of word types. Size: N_types doubles.
%   svd_R1   - Dimensions of the reduced-rank SVD in the first pass.
%   cl_num_1 - Number of clusters in the first pass.
%   svd_R2   - Dimensions of the reduced-rank SVD in the second pass.
%   cl_num_2 - Number of clusters in the second pass.
%
% Output:
%   labels   - C(tokens), where C is the first output of the second-pass
%              WKM2 call (the cluster id of every word type); i.e. one
%              cluster id per corpus token.
%
% Side effects: prints progress messages and the elapsed time of the
% first clustering call, and executes "diary off" before returning.
%
% NOTE(review): svd_R1 must not exceed min(N_types, context vector length)
% and svd_R2 must not exceed min(N_types, cl_num_1); otherwise the
% truncation in the helper indexes out of range.

%% FIRST PASS: compute svd of bigram matrix and cluster
% Build the bigram count matrix: entry (i, j) counts how often word j
% immediately follows word i (sparse() sums duplicate index pairs).
prev_tok = tokens(1:(end-1));
next_tok = tokens(2:end);
bigrams = sparse(prev_tok, next_tok, 1, N_types, N_types);
clear prev_tok next_tok

% Length of context vectors. Clamped to N_types so that the column/row
% slices below stay in range for vocabularies smaller than 1000 types.
context_vec_size = min(1000, N_types);
display(['FIRST PASS length of context vectors: ' num2str(context_vec_size)])
display([' SVD rank-reduced to: ' num2str(svd_R1)])

% Right context: for every type, counts of the word that follows it,
% restricted to the first context_vec_size word ids.
right_svd = unit_rr_svd(full(bigrams(1:N_types, 1:context_vec_size)), svd_R1);

% Left context: for every type, counts of the word that precedes it,
% restricted to the first context_vec_size word ids (transposed so that
% rows again index word types).
left_svd = unit_rr_svd(full(bigrams(1:context_vec_size, 1:N_types))', svd_R1);
clear bigrams

RR_SVD_1 = [left_svd right_svd];   % reduced-rank SVD descriptors for Pass 1
clear left_svd right_svd

display([' allocated number of clusters: ' num2str(cl_num_1)])
tic
% The first cl_num_1 descriptor rows are passed as the fourth argument.
% NOTE(review): presumably these are the initial centroids and the trailing
% 0 is a verbosity/option flag -- confirm against WKM2.
[T, temp, temp, lq, iter_cnt] = WKM2(RR_SVD_1, freqs, cl_num_1, RR_SVD_1(1:cl_num_1,:), 0); %#ok<ASGLU>
toc
clear RR_SVD_1
% Optional diagnostic (disabled):
% N_used_cluster_1 = sum(lq > 0);
% display([' number of used clusters: ' num2str(N_used_cluster_1)])

%% SECOND PASS: compute generalized context vectors
T_types = T(tokens);   % relabel the corpus with labels from first pass
clear T
display(['SECOND PASS SVD rank-reduced to: ' num2str(svd_R2)])

% Right context vectors (second pass): word type vs. first-pass label of
% the following token.
right_gen = unit_rr_svd( ...
    full(sparse(tokens(1:(end-1)), T_types(2:end), 1, N_types, cl_num_1)), svd_R2);

% Left context vectors (second pass): word type vs. first-pass label of
% the preceding token.
left_gen = unit_rr_svd( ...
    full(sparse(tokens(2:end), T_types(1:(end-1)), 1, N_types, cl_num_1)), svd_R2);

RR_SVD_2 = [left_gen right_gen];   % reduced-rank SVD descriptors for Pass 2
clear left_gen right_gen T_types

%% Compute final labels
[C, Q, mu, lq, iter_cnt] = WKM2(RR_SVD_2, freqs, cl_num_2, RR_SVD_2(1:cl_num_2,:), 0); %#ok<ASGLU>
% C is the cluster id of every word type

% label the corpus (tokens sequence)
labels = C(tokens);

disp(['********************** END ' datestr(now) ' **********************'])
fprintf('\n')
% NOTE(review): no matching "diary on" is visible in this file; presumably
% the caller enabled the diary -- confirm.
diary off


function D = unit_rr_svd(M, r)
% UNIT_RR_SVD  Rank-r SVD descriptors of the rows of M, scaled to unit length.
%   Computes the economy-size SVD of M, keeps the first r singular
%   directions (U(:,1:r) * S(1:r,1:r)) and scales every row of the result
%   to unit Euclidean length. Only the leading r-by-r block of S is used:
%   S is diagonal, so the remaining columns of the product would be all
%   zeros and would not affect row norms.
%
%   NOTE(review): a row whose descriptor is all zeros has norm 0 and becomes
%   NaN after scaling (0 * Inf); kept as in the original formulation.
[U, S, V] = svd(M, 'econ'); %#ok<ASGLU>
D = U(:, 1:r) * S(1:r, 1:r);
D = repmat(1 ./ sqrt(sum(D.^2, 2)), 1, size(D, 2)) .* D;