-
Notifications
You must be signed in to change notification settings - Fork 6
/
distance_calc.php
executable file
·78 lines (64 loc) · 1.93 KB
/
distance_calc.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/env php
<?php
// Compute a distance matrix for use by Neighbor Joining phylogeny generator.
// You give this program pairs of arguments on the command line: a file
// name followed by the label to print for that file in the output.
// It computes Normalized Compression Distance for all pairs of files.
// Output is on stdout, in the form of a Comma Separated Value file.
// Cache of compressed lengths; NCD() reaches it through `global`.
$string_lengths = array();
$filenames = array();
$names = array();
$strings = array();

// Drop the script name. What remains must be (file name, label) pairs, so
// an empty or odd-length argument list cannot be interpreted.
$script = array_shift($argv);
if (count($argv) === 0 || count($argv) % 2 !== 0) {
    fwrite(STDERR, "Usage: $script FILE LABEL [FILE LABEL ...]\n");
    exit(1);
}

// Load every file up front so that a read failure is reported before any
// CSV output has been written.
$pairs = array_chunk($argv, 2);
$max = count($pairs);
for ($i = 0; $i < $max; ++$i) {
    $filenames[$i] = $pairs[$i][0];
    $names[$i] = $pairs[$i][1];
    $strings[$i] = file_get_contents($filenames[$i]);
    if ($strings[$i] === false) {
        fwrite(STDERR, "Cannot read file: {$filenames[$i]}\n");
        exit(1);
    }
}

// Header row: one quoted label per file. Embedded double quotes are doubled
// so that the field stays valid CSV.
$comma = '';
for ($i = 0; $i < $max; ++$i) {
    printf("%s\"%s\"", $comma, str_replace('"', '""', $names[$i]));
    $comma = ',';
}
echo "\n";

// Matrix rows: only the diagonal and the cells to its right are computed;
// cells to the left of the diagonal are emitted as empty fields.
for ($i = 0; $i < $max; ++$i) {
    $comma = '';
    for ($j = 0; $j < $max; ++$j) {
        if ($j >= $i) {
            $d = NCD($strings[$i], $strings[$j]);
            printf("%s%.4f", $comma, $d);
        } else {
            print($comma);
        }
        $comma = ',';
    }
    echo "\n";
}
exit;
// This is the NCD algorithm from "COMMON PITFALLS USING THE NORMALIZED
// COMPRESSION DISTANCE: WHAT TO WATCH OUT FOR IN A COMPRESSOR", by
// MANUEL CEBRIAN, MANUEL ALFONSECA, AND ALFONSO ORTEGA
// http://www.ims.cuhk.edu.hk/~cis/2005.4/01.pdf
// I have it doing bzcompress() based on the recommendations of that
// article, although I don't think that the size of the WSO files is
// large enough to trip block size problems.
/**
 * Normalized Compression Distance between two strings, using bzcompress().
 *
 * With Z(s) = strlen(bzcompress(s)), the result is
 *   max(Z(xy) - Z(x), Z(yx) - Z(y)) / max(Z(x), Z(y)).
 *
 * Z(x) and Z(y) are memoized in the global $string_lengths array, keyed by
 * the string itself; the two concatenations are compressed on every call.
 *
 * @param string $stringX First input.
 * @param string $stringY Second input.
 * @return float|int The distance (PHP's `/` yields an int when the division
 *                   is exact, a float otherwise).
 * @throws RuntimeException If bzcompress() reports an error.
 */
function NCD($stringX, $stringY)
{
    global $string_lengths;
    // `global` binds a not-yet-defined variable to null, and
    // array_key_exists() rejects a non-array, so make sure the cache exists.
    if (!is_array($string_lengths)) {
        $string_lengths = array();
    }

    // bzcompress() returns an integer error code instead of a string when it
    // fails; taking strlen() of that code would silently corrupt the result.
    $compressedLength = function ($data) {
        $packed = bzcompress($data);
        if (!is_string($packed)) {
            throw new RuntimeException("bzcompress() failed with error code $packed");
        }
        return strlen($packed);
    };

    // Fill the cache for whichever of the two inputs has not been seen yet.
    foreach (array($stringX, $stringY) as $input) {
        if (!array_key_exists($input, $string_lengths)) {
            $string_lengths[$input] = $compressedLength($input);
        }
    }
    $Zx = $string_lengths[$stringX];
    $Zy = $string_lengths[$stringY];

    // Both concatenation orders are measured; the formula takes the larger
    // of the two increments.
    $Zxy = $compressedLength($stringX . $stringY);
    $Zyx = $compressedLength($stringY . $stringX);

    return max($Zxy - $Zx, $Zyx - $Zy) / max($Zx, $Zy);
}