Blame


1 08708877 2003-11-25 devnull # when raw index has a lot of entries like
2 08708877 2003-11-25 devnull # 1578324 problematico, a, ci, che
3 08708877 2003-11-25 devnull # apply this algorithm:
4 08708877 2003-11-25 devnull # treat things after comma as suffixes
5 08708877 2003-11-25 devnull # for each suffix:
6 08708877 2003-11-25 devnull # if single letter, replace last letter
7 08708877 2003-11-25 devnull # else search backwards for beginning of suffix
8 08708877 2003-11-25 devnull # and if it leads to an old suffix of approximately
9 08708877 2003-11-25 devnull # the same length, replace that suffix
10 08708877 2003-11-25 devnull # This will still leave some commas to fix by hand
11 08708877 2003-11-25 devnull # Usage: awk -F'\t' -f comfix.awk rawindex > newrawindex
12 08708877 2003-11-25 devnull
# Main rule: expand comma-abbreviated index entries.
#
# A record with exactly two fields whose second field contains a comma
# (e.g. "problematico, a, ci, che") is split on commas.  The first piece is
# the base word; it is emitted as its own "<$1> TAB <word>" line.  Each
# following piece is treated as a suffix: when matchsuflen() reports how many
# trailing characters of the base word it replaces, the rebuilt word is
# emitted; otherwise "<base>, <suffix>" is emitted so the leftover comma can
# be fixed by hand.  Every other record is copied through unchanged.
{
	if (NF != 2 || index($2, ",") == 0)
		print $0
	else {
		# commas may be followed by any number of blanks
		nparts = split($2, part, /,[ ]*/)
		base = part[1]
		printf "%s\t%s\n", $1, base
		for (j = 2; j <= nparts; j++) {
			# cut = number of trailing characters of base to drop (0 = no match)
			cut = matchsuflen(base, part[j])
			if (cut == 0)
				entry = base ", " part[j]
			else
				entry = substr(base, 1, length(base) - cut) part[j]
			printf "%s\t%s\n", $1, entry
		}
	}
}
35 08708877 2003-11-25 devnull
# matchsuflen(w, suf)
#   Decide how many trailing characters of word w the suffix suf replaces.
#
#   w    the base word
#   suf  the replacement suffix
#
#   Returns:
#     1  when suf is a single character (it replaces the last letter);
#     k  when the occurrence of suf's first character nearest the end of w
#        lies k characters from the end and k differs from length(suf)
#        by at most 3;
#     0  when that character does not occur in w, or the lengths differ
#        by more than 3 (no plausible old suffix).
#
#   Names after the extra spaces in the parameter list are locals.
function matchsuflen(w, suf,    wlen, suflen, first, pos, k, gap)
{
	suflen = length(suf)
	if (suflen == 1)
		return 1

	wlen = length(w)
	first = substr(suf, 1, 1)

	# walk from the last character of w toward the front
	for (pos = wlen; pos >= 1; pos--)
		if (substr(w, pos, 1) == first)
			break
	if (pos < 1)
		return 0

	# length of the old suffix that starts at pos
	k = wlen - pos + 1
	gap = k - suflen
	if (gap < -3 || gap > 3)
		return 0
	return k
}