Blob


# When the raw index has a lot of entries like
#	1578324	problematico, a, ci, che
# (a number, a tab, then a headword followed by comma-separated suffixes),
# apply this algorithm:
#	treat the things after each comma as suffixes
#	for each suffix:
#		if it is a single letter, replace the last letter of the headword
#		else search backwards in the headword for the first letter of
#		the suffix, and if that leads to an old suffix of approximately
#		the same length (within 3 characters), replace that old suffix
# This will still leave some commas to fix by hand.
# Usage: awk -F'\t' -f comfix.awk rawindex > newrawindex
# (the field separator is a single tab)
# Two-field records: expand an entry of the form "headword, suf1, suf2, ..."
# into one output line per word form, every line keyed by the first field.
NF == 2 {
	if ($2 ~ /,/) {
		nparts = split($2, part, /,[ ]*/)
		head = part[1]

		# The headword itself is always emitted first.
		printf "%s\t%s\n", $1, head

		for (j = 2; j <= nparts; j++) {
			# Number of trailing characters of the headword that
			# this suffix replaces; 0 means no plausible match.
			drop = matchsuflen(head, part[j])
			if (!drop)
				# Keep the comma form so it can be fixed by hand.
				form = head ", " part[j]
			else
				form = substr(head, 1, length(head) - drop) part[j]
			printf "%s\t%s\n", $1, form
		}
	} else {
		# No comma in the entry: nothing to expand.
		print
	}
}
# Records that do not have exactly two fields are copied through untouched.
NF != 2 { print }
# matchsuflen(w, suf): decide how many trailing characters of the word w
# should be replaced by the suffix suf.
#
#   w    the headword
#   suf  the replacement suffix
#
# Returns the number of trailing characters of w to drop before appending
# suf, or 0 when no plausible old suffix is found.  The result never
# exceeds length(w).
#
# Rules:
#   - a single-letter suffix replaces the last letter of w (returns 1);
#   - otherwise w is scanned from its end for the first letter of suf; the
#     match is accepted only if the old suffix found that way differs in
#     length from suf by at most 3 characters.
#
# The names after the gap in the parameter list are locals, following the
# usual awk convention; callers pass only w and suf.
function matchsuflen(w, suf,    wlen, suflen, c, k, d)
{
	wlen = length(w)
	suflen = length(suf)

	# Nothing can be replaced in an empty word, and an empty suffix has
	# no first letter to search for.
	if (wlen == 0 || suflen == 0)
		return 0

	if (suflen == 1)
		return 1

	# k counts characters from the end of w: k == 1 is the last letter.
	c = substr(suf, 1, 1)
	for (k = 1; k <= wlen; k++)
		if (substr(w, wlen - k + 1, 1) == c)
			break
	if (k > wlen)
		return 0	# first letter of suf does not occur in w

	# Reject the match when old and new suffix lengths differ too much.
	d = k - suflen
	if (d < 0)
		d = -d
	if (d > 3)
		return 0
	return k
}