% If you use this code, please cite: % Michael Lamar*, Yariv Maron*, Mark Johnson, Elie Bienenstock. % SVD and clustering for unsupervised POS tagging. (ACL 2010) % For more information, see the above reference. function [T, Q, mu, lq, iter_cnt] = WKM2(X, w, k, mu, switch_thresh_pct) % weighted K-means on 2-sphere % Each input vector is on the 2-sphere, % i.e., it is a concatenation of two unit vectors. % Correspondingly, the centroids are also on the 2-sphere. % INPUTS % X = matrix of data vectors (each row of X is one datum) % w = vector of weights to assigned to data (length(w) = size(X,1)) % k = desired number of clusters % mu = initial centroids of clusters (optional) % OUTPUTS % T = column vector containing the cluster number of each vector in X % Q = cell of cluster assignments (size is k). Each entry in the cell % contains the list of vectors assigned to that cluster % mu = matrix of cluster (weighted) centroids % lq = number of vectors in each of the clusters % iter_cnt = number of iterations to converge make_unit = @(x) repmat(1./sqrt(sum(x.^2,2)),1,size(x,2)).*x; %turns matrix of row vectors into matrix of unit row vectors if nargin < 3 % make sure there are enough inputs error('must specify three or more inputs') end [nr, nc] = size(X); % nr = num feature vects, nc = length of a feature vector w = w(:); % make sure weights form a column vector nd = nc/2; % number of dimensions of each unit vector if nr ~= numel(w) % make sure there is one weight per feature vector error('mismatch in size of inputs') end if ~exist('mu','var') % check to see if centroids were initialized mu = []; end if isempty(mu) % if they weren't provided, make them random mu = [make_unit(randn(k, nd)) make_unit(randn(k, nd))]; end if ~exist('switch_thresh_pct','var') % if no threshold given switch_thresh_pct = 1e-3; % make it 0.001 end if ~all(size(mu) == [k nc]) % make sure centroids have the right size disp([size(mu) k nc]) error('centroid matrix has wrong size') end w_rep = repmat(w, 1,nc); X_w = X.*w_rep; % weighted vectors go_on = true; % rate of change of cluster assignment is above threshold iter_cnt = 0; % count the iterations Q = cell(k, 1); % cluster assignments right_half = 1:nd; % indices of the right half of vector left_half = nd+1:nc; % left half Q_tmp = sparse(nr ,k); % stores temporary cluster assignment: % Q_tmp_new(s,t) is 1 iff vector s is in cluster t. while go_on iter_cnt = iter_cnt + 1; % Step 1: assign each vector to best cluster, i.e., with closest centroid fits = X*mu.'; % score of every vector for every cluster [best_fitness, best_cluster_num] = max(fits,[],2); % best cluster Q_tmp_new = sparse(1:nr, best_cluster_num, 1, nr, k); % new cluster assignment switches = nnz(Q_tmp - Q_tmp_new)/2; % number of vectors that switched Q_tmp = Q_tmp_new; % out with the old, in with the new go_on = switches > switch_thresh_pct*nr; % if few enough have switched, stop % Step 2: recompute centroids Y = Q_tmp.'*X_w; % new centroids mu = [make_unit(Y(:,right_half)) make_unit(Y(:,left_half))]; % put them on 2-sphere end for i = 1:nr my_type = best_cluster_num(i); Q{my_type} = [Q{my_type} i]; end lq = zeros(1,k); % number of vectors assigned to each of the k clusters for i = 1:k lq(i)=length(Q{i}); end % Throw out unused clusters: Qpos = lq > 0; % indices of clusters with at least one vector cnt = 0; filled_Qs = sum(Qpos); % how many of the k clusters are used Q1 = cell(filled_Qs,1); % new cell array mu1 = zeros(filled_Qs,2*nd); % new matrix of centroids T = zeros(size(X,1),1); for i = 1:k % loop on all clusters if Qpos(i) % if ith cluster is non-empty cnt = cnt + 1; % move to next Q1 cluster Q1{cnt} = Q{i}; % fill it with the vectors in Q{i} mu1(cnt,:) = mu(i,:); % also store the centroid T(Q1{cnt},1) = cnt; end end mu = mu1; % replace the centroids Q = Q1; % and the clusters lq = lq(Qpos).'; % keep only the lengths that were positive (and make it column)