Project management of NTIS P1 Cybernetic Systems and Department of Cybernetics | WiKKY

Project

General

Profile

Task #3941 » pros_words.py

Hanzlíček Zdeněk, 16.06.2016 16:11

 
1
# coding: utf-8
2

    
3
import argparse
4
import codecs
5
import asflight
6

    
7

    
8
# Proclitic list, version 1: monosyllabic prepositions plus the
# conjunctions "a" and "i".  Words are stored in lower case because
# set_pwords() lower-cases each word before the membership test.
proclitics_ver1 = [
    u"a", u"i", u"bez", u"či", u"dle", u"do", u"k", u"ke", u"kol", u"ku",
    u"na", u"nad", u"o", u"ob", u"od", u"po", u"pod", u"pro", u"před", u"přes", u"při",
    u"s", u"se", u"skrz", u"u", u"v", u"ve", u"vně", u"z", u"za", u"ze", u"zpod",
]
13

    
14
# Proclitic list, version 2: everything from version 1 plus monosyllabic
# relative pronouns.  Words are stored in lower case because set_pwords()
# lower-cases each word before the membership test.
proclitics_ver2 = [
    # --- same words as version 1
    u"a", u"i", u"bez", u"či", u"dle", u"do", u"k", u"ke", u"kol", u"ku",
    u"na", u"nad", u"o", u"ob", u"od", u"po", u"pod", u"pro", u"před", u"přes", u"při",
    u"s", u"se", u"skrz", u"u", u"v", u"ve", u"vně", u"z", u"za", u"ze", u"zpod",
    # --- monosyllabic relative pronouns
    # NOTE(review): the original list contained u"čím" twice; the duplicate
    # was dropped (no effect on membership tests).  Confirm that no other
    # form was intended in its place.
    u"kdo", u"co", u"čí", u"jenž", u"jež", u"již", u"jíž", u"jichž", u"jimž",
    u"němž", u"níž", u"nichž", u"jímž", u"kom", u"kým", u"čem", u"čím", u"čích",
]
21

    
22
# Enclitic pronouns (lower case); set_pwords() tests the lower-cased
# phrase-final word against this list.
enclitics = [u"se", u"si"]
24

    
25

    
26
# unit keys: phone, pwordBoundPos, pphrsBoundPos, prosodeme, word
27

    
28

    
29
def _mark_proclitics(unit_list, proclitics):
    """Rewrite 'pwordBoundPos' so that proclitics join the following word.

    Units are visited front to back.  A unit whose 'word' is neither None
    nor '.' is treated as the start of a word.  If the previous word was a
    proclitic, the new word is glued to it (both sides of the word boundary
    get '-'); otherwise the previous unit is closed with 'L' and the new
    unit is opened with 'F'.  A unit marked 'P' closes the previous unit
    with 'L' and cancels any pending proclitic.
    """
    # NOTE(review): presumably 'F' = first unit of a prosodic word,
    # 'L' = last unit, '-' = inner unit and 'P' = pause -- confirm against
    # the ASF format description.
    unit_prev = None
    append = False  # True while the most recent word was a proclitic

    for unit in unit_list:

        if unit['pwordBoundPos'] == 'P':
            if (unit_prev is not None) and (unit_prev['pwordBoundPos'] != 'P'):
                unit_prev['pwordBoundPos'] = 'L'
            append = False

        elif (unit['word'] is not None) and (unit['word'] != '.'):

            if not append:
                # Regular word boundary: close the previous unit, open a new one.
                if (unit_prev is not None) and (unit_prev['pwordBoundPos'] != 'P'):
                    unit_prev['pwordBoundPos'] = 'L'
                unit['pwordBoundPos'] = 'F'

            else:
                # Boundary after a proclitic: erase it.  A previous unit marked
                # 'F' is left alone so that a proclitic consisting of a single
                # unit still opens the prosodic word.
                if (unit_prev is not None) and (unit_prev['pwordBoundPos'] not in ('P', 'F')):
                    unit_prev['pwordBoundPos'] = '-'
                unit['pwordBoundPos'] = '-'

            append = unit['word'].lower() in proclitics

        unit_prev = unit


def _mark_enclitics(unit_list, enclitic_words):
    """Join a phrase-final enclitic to the word that precedes it.

    Units are visited back to front.  After a unit with
    'pphrsBoundPos' == 'L' has been seen, the first unit carrying a real
    word (not None, not '.') is examined: if that word is an enclitic, the
    unit and the unit immediately before it in the list both get
    'pwordBoundPos' = '-'.
    """
    append = False   # True when the next visited unit must be glued as well
    phr_end = False  # True between a phrase-final unit and the word found for it

    for unit in reversed(unit_list):

        if unit['pphrsBoundPos'] == 'L':
            phr_end = True

        # Must run before the enclitic test below, so that it affects the unit
        # visited *after* the enclitic's word unit, not that unit itself.
        if append:
            unit['pwordBoundPos'] = '-'
            append = False

        if phr_end and (unit['word'] is not None) and (unit['word'] != "."):
            append = unit['word'].lower() in enclitic_words
            if append:
                unit['pwordBoundPos'] = '-'
            phr_end = False


def _fix_prosodemes(unit_list):
    """Spread each phrase-final prosodeme over the last prosodic word.

    Units are visited back to front.  The 'prosodeme' of a unit with
    'pphrsBoundPos' == 'L' is copied to every visited unit up to and
    including the first one whose 'pwordBoundPos' is 'F'.  Every other unit
    has its 'prosodeme' reset to '0' unless it already is '0' or 'X.X'.
    """
    prosodeme = None  # value being propagated, or None outside the final word

    for unit in reversed(unit_list):

        if unit['pphrsBoundPos'] == 'L':
            prosodeme = unit['prosodeme']

        if prosodeme is not None:
            unit['prosodeme'] = prosodeme

            if unit['pwordBoundPos'] == 'F':
                prosodeme = None

        elif unit['prosodeme'] not in ('0', 'X.X'):
            unit['prosodeme'] = '0'


def set_pwords(unit_list, ver=1):
    """Re-segment the prosodic words of one utterance, in place.

    Three passes are made over unit_list:

    1. proclitics are attached to the following word,
    2. a phrase-final enclitic is attached to the preceding word,
    3. prosodemes are corrected to match the new prosodic words.

    Args:
        unit_list: sequence of dict-like units providing the keys
            'pwordBoundPos', 'pphrsBoundPos', 'prosodeme' and 'word'.
            It must support reversed().  The units are modified in place.
        ver: which proclitic list to use; 1 selects proclitics_ver1,
            2 selects proclitics_ver2.

    Raises:
        ValueError: if ver is neither 1 nor 2.  The check is done before
            any unit is touched.
    """
    if ver == 1:
        proclitics = proclitics_ver1
    elif ver == 2:
        proclitics = proclitics_ver2
    else:
        raise ValueError("unsupported proclitics version: {!r} (expected 1 or 2)".format(ver))

    # ----- process proclitics
    _mark_proclitics(unit_list, proclitics)

    # ----- process enclitics
    _mark_enclitics(unit_list, enclitics)

    # ----- correction of prosodemes
    _fix_prosodemes(unit_list)
102

    
103

    
104
# ----------
105

    
106

    
107
def main():
    """Command-line entry point: fix the prosodic words of an ASF file.

    Reads the input ASF file, runs set_pwords() on every utterance and
    writes the result to the output ASF file.  Both files use the encoding
    given by --code-page.
    """
    parser = argparse.ArgumentParser(description="Modify prosodic words in ASF file.")

    parser.add_argument("asfIn", type=str, metavar="ASF_IN", help="input ASF file")
    parser.add_argument("asfOut", type=str, metavar="ASF_OUT", help="output ASF file")
    parser.add_argument("-c", "--code-page", type=str, metavar="CODEPAGE", dest="codePage",
                        help="encoding of all files, default value: %(default)s", default='utf-8')
    # The default of 1 keeps the behaviour of the script unchanged when the
    # option is not given.
    parser.add_argument("-p", "--proclitics-version", type=int, choices=(1, 2), dest="proclVer",
                        help="version of the proclitic list, default value: %(default)s", default=1)

    args = parser.parse_args()

    asf = asflight.AsfLight(args.asfIn, args.codePage)

    # Iterating the ASF object yields utterance names.  set_pwords() works in
    # place and nothing is assigned back, so this presumably relies on
    # asf[utt_name] returning the stored unit list rather than a copy.
    for utt_name in asf:
        set_pwords(asf[utt_name], ver=args.proclVer)

    asf.write(args.asfOut, args.codePage)
124

    
125

    
126
# ----------
127

    
128
# Run the command-line program only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
(2-2/2)