package main; our $SEE; package Jaccard_coefficient_cluster_resolver; use strict; BEGIN { use SingleLinkageClusterer; } =head1 NAME package Jaccard_coefficient_cluster_resolver =cut =head1 DESCRIPTION Module is used to resolve the clusters within a graph by using the Jaccard's similarity coefficient to break edges within loosely coupled clusters. The Jaccard's similarity coefficient is defined as: given nodes A and B, X = set of nodes connected to A including A Y = set of nodes connected to B including B #(nodes intersecting X and Y) (X && Y) Jlink (A,B) = ----------------------------- = ---------- #(nodes in set X union set Y) (X || Y) Jlink = 1 when nodes A and B are identically connected. Jlink = 0 when nodes A and B have no connected neighbors in common, and are themselves unconnected. For connected nodes A and B which have different neighbors, the Jlink score will be between 0 and 1, providing a similiarity coefficient for the level of similarity between sets of connections. Another description of the Jaccard similarity coefficient: (http://www.ergometrika.org/Volume3/mulqueen-rev-2003.htm) The Jaccard index was originally developed to assess similarity among distributions of flora in different geographic areas (Jaccard, 1912). The procedure results in a matching coefficient for binary variables in which joint absences are excluded from both the denominator and the numerator and equal weight is given to matches and non-matches: SJ = a/(a+b+c) x 100, where SJ = Jaccard similarity coefficient, a = number of elements shared by all groups, b = number of elements unique to the first group, and c = number of elements unique to the second group. Jaccard, P. (1912). The distribution of flora in the alpine zone. The New Phytologist, 11(2), 37-50. =cut =over 4 =item new() B Constructor: Instantiate an object of Jaccard_coefficient_cluster_resolver B $linkScore $linkScore is a real number between 0 and 1 defined as the Jaccard similarity coefficient. B $Jaccard_coefficient_cluster_resolver_object =back =cut sub new { my $packagename = shift; my $linkScore = shift; my $cluster_path = shift; # Utilizing public variable from SingleLinkageClusterer $SingleLinkageClusterer::CLUSTERPATH = $cluster_path; unless ($linkScore >= 0 && $linkScore <= 1) { die "Invalid link score ($linkScore). \n\n 0 <= link_score <= 1\n\n\n"; } my $self = { linkScore => $linkScore, }; bless ($self, $packagename); return ($self); } =over 4 =item resolve_clusters() B Given a set of paired elements, the edges between pairs are removed if the elements have a link score less than that set in the constructor. B @pairs @pairs is a list of paired elements in the form of array references. For example: @pairs = ( [ a, b], [b, c], [e, f] ) B @clusters @clusters is a list of array references where each array references provides a list of elements belonging to a single cluster. For example, given the inputted pairs above as input, @clusters = ( [a, b, c], [e, f], ) See SingleLinkageClusterer.pm for more details. =back =cut sub resolve_clusters { my $self = shift; my @pairs = @_; my $linkScore = $self->{linkScore}; ## Transform each pair to a data structure which provides sort of a graph, with each element (node) pointing to a list of other elements(nodes), implemented with a hash{node_id} = (node id list) my %inputGraph; foreach my $pair (@pairs) { my ($a, $b) = @$pair; my $a_aref = $inputGraph{$a}; unless ($a_aref) { $a_aref = $inputGraph{$a} = []; } my $b_aref = $inputGraph{$b}; unless ($b_aref) { $b_aref = $inputGraph{$b} = []; } &add_element($a_aref, $b); &add_element($b_aref, $a); } ## Now examine the Jaccard similarity coefficient between each member my @resolved_pairs; #all pairs meeting link score restrictions. my %seen; #avoid analyzing the same pairs twice. my @all_elements = keys %inputGraph; foreach my $element (@all_elements) { my $connected_elements_aref = $inputGraph{$element}; my @listA = ($element, @$connected_elements_aref); foreach my $connected_element (@$connected_elements_aref) { if ($seen{$element}->{$connected_element}) { next;} #avoid duplicate comparisons. my $neighbors_list_aref = $inputGraph{$connected_element}; my @listB = ($connected_element, @$neighbors_list_aref); my $current_link_score = &calculate_Jaccard_coeff(\@listA, \@listB); if ($current_link_score >= $linkScore) { #passed test push (@resolved_pairs, [$element, $connected_element]); } $seen{$element}->{$connected_element} = 1; $seen{$connected_element}->{$element} = 1; } } my @clusters; if (@resolved_pairs) { @clusters = &SingleLinkageClusterer::build_clusters(@resolved_pairs); } return (@clusters); } #private sub add_element { my ($list_aref, $element) = @_; my $has_element = 0; foreach my $existing_element (@$list_aref) { if ($element eq $existing_element) { $has_element = 1; last; } } if (! $has_element) { push (@$list_aref, $element); } } #private #### sub calculate_Jaccard_coeff { my ($a_list_aref, $b_list_aref) = @_; if ($SEE) { print "\nList a: (@$a_list_aref)\n" . "List b: (@$b_list_aref)\n"; } ## Track each element my (%all_entries, %a_entries, %b_entries); foreach my $element (@$a_list_aref) { $a_entries{$element} = 1; $all_entries{$element} = 1; } foreach my $element (@$b_list_aref) { $b_entries{$element} = 1; $all_entries{$element} = 1; } ## Determine number of unique entries. my @unique_entries = keys %all_entries; my $num_unique_entries = $#unique_entries + 1; print "$num_unique_entries unique entries between a and b: (@unique_entries)\n" if $SEE; ## See how many a and b have in common. my $num_in_common = 0; foreach my $element (@unique_entries) { if ($a_entries{$element} && $b_entries{$element}) { $num_in_common++; print "($element) is common.\n" if $SEE; } } my $link_score = ($num_in_common / $num_unique_entries); print "link score (a,b) = ($num_in_common / $num_unique_entries) = $link_score\n" if $SEE; return ($link_score); } 1; #EOM