-
Notifications
You must be signed in to change notification settings - Fork 2
/
convert_to_moses.py
executable file
·127 lines (105 loc) · 4.46 KB
/
convert_to_moses.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/usr/bin/env python
# Fri Nov 29 15:57:05 2019 by Masaaki Nagata
# Convert LDC Chinese-English word alignment data to Moses format.
# convert_to_moses.py < XXX.ldc > XXX.moses
# Entries whose wa (word alignment) file is "rejected" are discarded.
# If a token mismatch is anything other than a speaker tag ^[), time stamp ^<,
# or empty category ^*, that sentence is discarded.
# 2019/11/29
# Handle cases such as cmn-NG-31-111576-3460874-S1.eng.{raw,tkn}:
# raw: If [we//you//one] [take//takes] a comprehensive look at
# tokenized: If we take a comprehensive look at
import sys
import argparse
import numpy as np
import re
def find_start_charindex(toks, sent):
    """Locate each token of a tokenized sentence inside the raw sentence.

    Tokens are searched for left to right.  Each search starts just past the
    end of the previous successful match, so repeated tokens map to
    successive occurrences.  A token that cannot be found does not advance
    the search position.

    Matching is case-sensitive: neither ``toks`` nor ``sent`` is case-folded
    here, so callers that need case-insensitive matching must normalize both
    arguments beforehand.

    Args:
        toks: Token strings, in sentence order.
        sent: The original (untokenized) sentence string.

    Returns:
        A list with one entry per token: the character offset in ``sent``
        where that token starts, or -1 if the token is empty or does not
        occur at or after the current search position.
    """
    offset = 0
    tok_to_charindex = []
    for tok in toks:
        # str.find('') "succeeds" at any offset, which would report an empty
        # token (e.g. produced by consecutive spaces) as located; treat it
        # as a mismatch instead.
        idx = sent.find(tok, offset) if tok else -1
        if idx >= 0:
            tok_to_charindex.append(idx)
            offset = idx + len(tok)
        else:
            tok_to_charindex.append(-1)
    return tok_to_charindex
# Annotation tags attached to alignment tokens; compiled once, not per token.
_BRACKET_TAG_RE = re.compile(r'\[\w+\]')  # e.g. [DET]
_PAREN_TAG_RE = re.compile(r'\(\w+\)')    # e.g. (GIS)


def _parse_alignment(a_line, n_f, n_e):
    """Build the (n_f x n_e) 0/1 alignment matrix described by ``a_line``.

    ``a_line`` is a space-separated list of tokens of the form
    ``f1,f2,...-e1,e2,...`` with 1-based token indexes, optionally decorated
    with ``[TAG]`` / ``(TAG)`` annotations, which are stripped.  Every (f, e)
    pair of a token is marked with 1.  Tokens that are empty once the tags
    are removed are skipped.
    """
    a_matrix = np.zeros((n_f, n_e), dtype=int)
    for a_tok in a_line.strip().split(' '):
        a_tok = _BRACKET_TAG_RE.sub('', a_tok)
        a_tok = _PAREN_TAG_RE.sub('', a_tok)
        if not a_tok:
            # Empty alignment field, doubled space or tag-only token: there
            # is no "f-e" pair to unpack.
            continue
        f_seq, e_seq = a_tok.split('-')
        # Either side may be empty (a token with no counterpart).
        f_ids = [int(f) for f in f_seq.split(',') if f]
        e_ids = [int(e) for e in e_seq.split(',') if e]
        for f in f_ids:
            for e in e_ids:
                a_matrix[f - 1, e - 1] = 1  # input indexes are 1-based
    return a_matrix


def main(args):
    """Convert word-alignment records read from stdin and print them to stdout.

    Each input line holds five tab-separated fields: the tokenized f-side
    sentence, the tokenized e-side sentence, the alignment, the original
    f-side sentence and the original e-side sentence.  Lines whose third
    field is ``rejected`` carry no alignment and are skipped.

    Tokens that cannot be located in their original sentence (see
    ``find_start_charindex``) are removed, and the alignment is rewritten as
    0-based ``i-j`` pairs over the remaining tokens.  A record is printed, in
    the same five-field layout, only if tokens remain on both sides and at
    least one alignment pair remains.

    Args:
        args: Parsed command-line options; only ``args.verbose`` is read.
            When true, sentences containing unlocatable tokens are reported
            on stderr.

    Raises:
        IndexError: If a line has fewer than three fields, or an alignment
            index is larger than the number of tokens.
        ValueError: If a non-rejected line does not have exactly five
            fields, or an alignment token is malformed.
    """
    for line in sys.stdin:
        # Strip the newline and split the record into its tab-separated fields.
        fields = line.rstrip('\n').split('\t')
        if fields[2] == 'rejected':  # no word alignment data for this entry
            continue
        f_line, e_line, a_line, f_orig, e_orig = fields
        f_toks = f_line.strip().split(' ')
        e_toks = e_line.strip().split(' ')
        f_starts = find_start_charindex(f_toks, f_orig)
        e_starts = find_start_charindex(e_toks, e_orig)

        if args.verbose:
            for toks, orig, starts in ((f_toks, f_orig, f_starts),
                                       (e_toks, e_orig, e_starts)):
                if -1 in starts:
                    print(' '.join(toks), orig,
                          ' '.join(map(str, starts)),
                          file=sys.stderr)

        a_matrix = _parse_alignment(a_line, len(f_toks), len(e_toks))

        # Keep only tokens that were located in the original sentence, and
        # the matching rows/columns of the alignment matrix.
        f_keep = np.array(f_starts) != -1
        e_keep = np.array(e_starts) != -1
        new_f_toks = [tok for tok, start in zip(f_toks, f_starts)
                      if start != -1]
        new_e_toks = [tok for tok, start in zip(e_toks, e_starts)
                      if start != -1]
        new_a_matrix = a_matrix[np.ix_(f_keep, e_keep)]

        # argwhere yields the non-zero cells in row-major order, i.e. sorted
        # by f index and then by e index.
        new_a_toks = ['{}-{}'.format(i, j)
                      for i, j in np.argwhere(new_a_matrix).tolist()]

        # Print the record only if there is something to output.
        if new_f_toks and new_e_toks and new_a_toks:
            print("\t".join([' '.join(new_f_toks),
                             ' '.join(new_e_toks),
                             ' '.join(new_a_toks),
                             f_orig, e_orig]))
if __name__ == '__main__':
    # Script entry point.  The only option is -v/--verbose, a boolean flag
    # that is handed to main() as args.verbose.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--verbose', '-v', action='store_true')
    main(arg_parser.parse_args())