| 
      1
     | 
    
      # coding: utf-8
 
     | 
  
  
    | 
      2
     | 
    
      
 
     | 
  
  
    | 
      3
     | 
    
      import argparse
 
     | 
  
  
    | 
      4
     | 
    
      import codecs
 
     | 
  
  
    | 
      5
     | 
    
      import asflight
 
     | 
  
  
    | 
      6
     | 
    
      
 
     | 
  
  
    | 
      7
     | 
    
      
 
     | 
  
  
    | 
      8
     | 
    
      # Version 1: monosyllabic prepositions plus the conjunctions "a" and "i".
      proclitics_ver1 = [
          u"a", u"i",
          u"bez", u"či", u"dle", u"do", u"k", u"ke", u"kol", u"ku",
          u"na", u"nad", u"o", u"ob", u"od", u"po", u"pod", u"pro",
          u"před", u"přes", u"při",
          u"s", u"se", u"skrz", u"u", u"v", u"ve", u"vně",
          u"z", u"za", u"ze", u"zpod",
      ]

      # Version 2: everything from version 1 followed by the monosyllabic
      # relative pronouns.  A new list is built, so the two versions never
      # share state.
      # NOTE: u"čím" is listed twice, exactly as in the original list; this is
      # harmless for membership tests.
      proclitics_ver2 = proclitics_ver1 + [
          u"kdo", u"co", u"čí", u"jenž", u"jež", u"již", u"jíž", u"jichž", u"jimž",
          u"němž", u"níž", u"nichž", u"jímž", u"kom", u"kým", u"čem",
          u"čím", u"čím", u"čích",
      ]

      # Enclitic pronouns.
      enclitics = [u"se", u"si"]
 
     | 
  
  
    | 
      24
     | 
    
      
 
     | 
  
  
    | 
      25
     | 
    
      
 
     | 
  
  
    | 
      26
     | 
    
      # unit keys: phone, pwordBoundPos, pphrsBoundPos, prosodeme, word
 
     | 
  
  
    | 
      27
     | 
    
      
 
     | 
  
  
    | 
      28
     | 
    
      
 
     | 
  
  
    | 
      29
     | 
    
      def set_pwords( unit_list, ver=1 ):
          """Re-mark prosodic-word boundaries and prosodemes of the units, in place.

          Each unit is indexed by the keys 'pwordBoundPos', 'pphrsBoundPos',
          'prosodeme' and 'word'.  A unit "carries a word" when its 'word' value
          is neither None nor ".".
          NOTE(review): the marks presumably mean 'P' = pause, 'F' = first unit,
          'L' = last unit and '-' = inner unit of a prosodic word -- confirm
          against the ASF format description.

          Three passes are made over ``unit_list``:

          1. Proclitics (forward).  A word-carrying unit is marked 'F', and the
             unit before it 'L' (unless that unit is 'P').  If the previous word
             is in the selected proclitic list, the word-carrying unit is marked
             '-' instead and the unit before it is also set to '-' (unless it is
             'P' or 'F').  The unit before a 'P' unit is marked 'L' unless it is
             'P' itself.
          2. Enclitics (backward).  For every unit whose 'pphrsBoundPos' is 'L',
             the nearest word-carrying unit at or before it is examined; if its
             word is in ``enclitics``, that unit and the unit immediately before
             it in the list are marked '-'.
          3. Prosodemes (backward).  The prosodeme of each unit whose
             'pphrsBoundPos' is 'L' is copied onto the preceding units up to and
             including the first one marked 'F'.  Any other unit whose prosodeme
             is neither '0' nor 'X.X' is reset to '0'.

          Args:
              unit_list: reversible sequence of dict-like units (modified in place).
              ver: proclitic list to use; 1 selects ``proclitics_ver1``,
                  2 selects ``proclitics_ver2``.

          Returns:
              None.

          Raises:
              ValueError: if ``ver`` is neither 1 nor 2.
          """

          # Validate ``ver`` up front: previously an unknown value left
          # ``proclitics`` unbound and failed later with UnboundLocalError (or was
          # silently ignored when no unit carried a word).
          if ver == 1:
              proclitics = frozenset( proclitics_ver1 )
          elif ver == 2:
              proclitics = frozenset( proclitics_ver2 )
          else:
              raise ValueError( "unsupported proclitics version: %r (expected 1 or 2)" % ( ver, ) )

          # Hashed lookup instead of a list scan for every word.
          enclitic_set = frozenset( enclitics )

          # ----- process proclitics

          unit_prev = None
          append = False      # True when the previous word was a proclitic

          for unit in unit_list:

              if unit['pwordBoundPos'] == 'P':
                  # a 'P' unit closes the running prosodic word
                  if ( unit_prev is not None ) and ( unit_prev['pwordBoundPos'] != 'P' ):
                      unit_prev['pwordBoundPos'] = 'L'
                  append = False

              elif ( unit['word'] is not None ) and ( unit['word'] != '.' ):

                  if not append:
                      # start a new prosodic word; close the previous one
                      if ( unit_prev is not None ) and ( unit_prev['pwordBoundPos'] != 'P' ):
                          unit_prev['pwordBoundPos'] = 'L'
                      unit['pwordBoundPos'] = 'F'

                  else:
                      # attach this word to the preceding proclitic: both the end of
                      # the proclitic and the start of this word become inner units
                      # (a proclitic's own 'F' start is kept)
                      if ( unit_prev is not None ) and ( unit_prev['pwordBoundPos'] not in ( 'P', 'F' ) ):
                          unit_prev['pwordBoundPos'] = "-"
                      unit['pwordBoundPos'] = "-"

                  append = ( unit['word'].lower() in proclitics )

              unit_prev = unit

          # ----- process enclitic

          append = False      # True right after an enclitic start has been merged
          phr_end = False     # True between a phrase-final unit and its word start

          for unit in reversed( unit_list ):

              if unit['pphrsBoundPos'] == 'L':
                  phr_end = True

              if append:
                  # the unit just before the enclitic (in list order) becomes inner
                  unit['pwordBoundPos'] = '-'
                  append = False

              if phr_end and ( unit['word'] is not None ) and ( unit['word'] != "." ):
                  append = ( unit['word'].lower() in enclitic_set )
                  if append:
                      unit['pwordBoundPos'] = '-'
                  phr_end = False

          # ----- correction of prosodemes

          prosodeme = None    # prosodeme being propagated backwards, if any

          for unit in reversed( unit_list ):

              if unit['pphrsBoundPos'] == 'L':
                  prosodeme = unit['prosodeme']

              if prosodeme is not None:
                  unit['prosodeme'] = prosodeme

                  # stop propagating once the start of the prosodic word is reached
                  if unit['pwordBoundPos'] == 'F':
                      prosodeme = None

              elif ( unit['prosodeme'] != '0' ) and ( unit['prosodeme'] != 'X.X' ):
                  unit['prosodeme'] = '0'
 
     | 
  
  
    | 
      102
     | 
    
      
 
     | 
  
  
    | 
      103
     | 
    
      
 
     | 
  
  
    | 
      104
     | 
    
      # ----------
 
     | 
  
  
    | 
      105
     | 
    
      
 
     | 
  
  
    | 
      106
     | 
    
      
 
     | 
  
  
    | 
      107
     | 
    
      def main():
          """Command-line entry point.

          Parses two positional arguments (input and output ASF file) and the
          optional -c/--code-page encoding (default 'utf-8').  Loads the input
          with asflight.AsfLight, runs set_pwords() with its default arguments
          on the item stored under every name the loaded object yields, and
          writes the result to the output file using the same encoding.
          """

          cli = argparse.ArgumentParser( description="Modify prosodic words in ASF file." )

          cli.add_argument( dest="asfIn", metavar="ASF_IN", type=str, help="input ASF file" )
          cli.add_argument( dest="asfOut", metavar="ASF_OUT", type=str, help="output ASF file" )
          cli.add_argument( "-c", "--code-page", dest="codePage", metavar="CODEPAGE", type=str,
                            default='utf-8',
                            help="encoding of all files, default value: %(default)s" )

          options = cli.parse_args()
          encoding = options.codePage

          utterances = asflight.AsfLight( options.asfIn, encoding )

          # set_pwords() modifies each unit list in place
          for utt_name in utterances:
              set_pwords( utterances[ utt_name ] )

          utterances.write( options.asfOut, encoding )
 
     | 
  
  
    | 
      124
     | 
    
      
 
     | 
  
  
    | 
      125
     | 
    
      
 
     | 
  
  
    | 
      126
     | 
    
      # ----------
 
     | 
  
  
    | 
      127
     | 
    
      
 
     | 
  
  
    | 
      128
     | 
    
      # script entry point: call main() only when executed directly, not on import
      if __name__ == "__main__":
          main()
 
     |