Module model
Source Code for Module model

   1  """ 
   2          Bachelor AI project 2009. Andreas van Cranenburgh 0440949. 
   3          Two-word stage dialogue simulator using a corpus of exemplars. 
   4   
   5          Interactive usage: 
   6          $ python model.py 
   7   
   8          Other uses: 
   9                  - Re-enact dialogue in 01.cha (should be a text file with one utterance 
  10                    per line, with each line preceeded by *MOT or *CHI, as in the CHILDES 
  11                    corpus): 
  12   
  13                    $ python 
  14  #                 >>> import model 
  15  #                 >>> model.evaluate(open('01.cha').read()) 
  16                    ... 
  17   
  18                  - Talk to self for 25 utterances: 
  19   
  20  #                 >>> model.selftalk(25) 
  21                    ... 
  22   
  23                  - Demonstrate some sample utterances that work (but not from corpus): 
  24   
  25  #                 >>> model.test() 
  26                    ... 
  27  """ 
  28   
  29  # (before continuing set tabstops to 4) 
  30  # update documentation by doing "pydoc -w model", then open "model.html" 
  31  # alternatively, use "epydoc model". 
  32   
  33  #random factoid: 
  34  #You can reference a list comprehension as it is being built by the symbol 
  35  #'_[1]'. For example, the following function unique-ifies a list of elements 
  36  #without changing their order by referencing its list comprehension. 
  37  # 
  38  #def unique(my_list): 
  39  #    return [x for x in my_list if x not in locals()['_[1]']] 
  40   
  41  try: 
  42          from free_will import choice 
  43  except ImportError: 
  44          from random import choice 
  45  from string import uppercase, lowercase 
  46  from collections import defaultdict 
  47  from pprint import pprint 
  48  #stands for pretty-print; library implementors have some strange standards 
  49  import math 
  50   
  51 -def main(): 
  52          """ Interactive textual user interface. """ 
  53          print "Child dialogue simulator.", 
  54          print "Enter parent utterance (or `quit' to quit)." 
  55          print 
  56          print "Utterances in corpus: " 
  57          print getexemplars().keys() 
  58          print 
  59          lexicon = inferlexicon(getexemplars(), True) 
  60          print 'Lexicon (distilled from corpus): ', lexicon 
  61          #test(getexemplars(), lexicon) 
  62          d = dialogue() 
  63          d.send(None) 
  64          while True: 
  65                  print "Parent: ", 
  66                  utt = raw_input() 
  67                  if utt == 'quit': 
  68                          break 
  69                  reply = d.send(utt) 
  70                  print "Child: ", reply 
  71   
  72 -def dbg(*m): 
  73          """ Print debug output (also see nodbg). 
  74                  Takes a variable number of arguments of any type. 
  75   
  76                  >>> dbg('choices', range(3)) 
  77                  choices [0, 1, 2] 
  78          """ 
  79          print (' '.join(list(str(a).expandtabs() for a in m))) 
  80 -def nodbg(*m): pass 
  81   
  82 -def dialogue(exemplars={}, constructions={}, lexicon={}, debug=dbg): 
  83          """ Co-routine for dialogue. Use send(utterance) to receive a reply. 
  84                  Initial input: exemplars 
  85                  Further input: one utterance at a time 
  86                  Output: one reply at a time""" 
  87          if not exemplars: 
  88                  exemplars = getexemplars() 
  89          if not lexicon: 
  90                  lexicon = inferlexicon(exemplars, False) 
  91          if not constructions: 
  92                  constructions = inferconstructions(exemplars, lexicon) 
  93          discourse, topic, reactionutt = [('','')], '', '' 
  94          while True: 
  95                  utt = yield reactionutt 
  96                  newmeaning = interpret(utt, topic, exemplars, lexicon) 
  97                  if 'reinforcement' in newmeaning and len(discourse) > 1: 
  98                          reinforce(meaning, reaction, reactionutt, discourse, exemplars) 
  99                  meaning = newmeaning 
 100                  debug(' interpretation:', meaning) 
 101   
 102                  #todo: detect correction feedback, store. DISABLED 
 103                  if False and utt not in exemplars and meaning: 
 104                          if meaning == lexiconcheck(utt, meaning, lexicon): 
 105                                  debug('added utterance to exemplars') 
 106                                  exemplars[utt] = meaning 
 107                                  constructions = inferconstructions(exemplars, lexicon) 
 108                          else: 
 109                                  debug('generation check failed, not added to exemplars') 
 110   
 111                  reaction = response(meaning, exemplars) 
 112                  debug(' reaction:', reaction) 
 113                  reactionutt = express2(reaction, exemplars, constructions, lexicon) 
 114                  discourse.append((utt, meaning)) 
 115                  discourse.append((reactionutt, reaction)) 
 116                  # if the current topic is not present in the parent's utterance, 
 117                  # see if there is a new one. otherwise there is topic continuity 
 118                  if (topic not in meaning and topic) or not topic: 
 119                          topic = findtopic(discourse) or '' 
 120   
 121 -def selftalk(u=25, e={}, debug=dbg): 
 122          """ Talk to oneself, picking a random utterance from the exemplars when 
 123                  repetition is detected. 
 124                  Input: number of utterances to generate, exemplars 
 125                  Output: transcript of conversation 
 126          """ 
 127          if not e: 
 128                  e = getexemplars() 
 129          a = choice(e.keys()) 
 130          speaking, listening = '*MOT', '*CHI' 
 131          said = set([a]) 
 132          c = [] 
 133   
 134          d = dialogue(e) 
 135          d.send(None) 
 136   
 137          for x in range(u): 
 138                  c.append('%s: %s' % (speaking, a)) 
 139                  debug(c[-1]) 
 140                  speaking, listening = listening, speaking 
 141                  r = d.send(a) 
 142                  if a == r and speaking == '*MOT': 
 143                          debug('picking random utterance') 
 144                          # because sets are not subscriptable, it is not possible to apply 
 145                          # choice to sets, perhaps this indicates that the axiom of choice 
 146                          # is untenable after all?! 
 147                          a = choice(list(set(e.keys()) - said)) 
 148                          said.add(a) 
 149                  else: 
 150                          a = r 
 151          print '\n', '\n'.join(c) 
 152   
 153 -def evaluate(conv, exemplars={}, n=1): 
 154          """ When presented with a corpus fragment, feed parent utterances to model  
 155                  and generate model responses in place of child utterances. 
 156                  n = number of iterations to perform 
 157                  Input: string with transcript of corpus 
 158                  Output: same transcript with replies from model instead of child 
 159          """ 
 160          # idea: only continue for loop if model answer matches child answer? 
 161          # this allows reinforcement to select correct answers 
 162          if not exemplars: 
 163                  exemplars = getexemplars() 
 164          # start & initialize co-routine 
 165          d = dialogue(exemplars) 
 166          d.send(None) 
 167          for a in range(n): 
 168                  modelconv = [] 
 169                  for b in conv.split('\n'): 
 170                          try: 
 171                                  who, utt = b.split(':', 1) 
 172                          except ValueError: 
 173                                  # ignore unparsable lines: 
 174                                  continue 
 175                          if 'MOT' in who: 
 176                                  print b 
 177                                  ans = '*CHI:  ' + d.send(utt) 
 178                                  print ans 
 179                                  for x in (b, ans): 
 180                                          modelconv.append(x) 
 181          print 
 182          return '\n'.join((str(a) for a in modelconv)) 
 183   
 184 -def reinforce(meaning, reaction, reactionutt, discourse, exemplars): 
 185          """ Strengthen connection between last two utterances by adding  
 186                  a random identifier and adding or updating the resulting exemplar. 
 187                  Input: two utterances with meanings 
 188                  Side-effect: updated or added exemplars 
 189          """ 
 190          randomid = "".join(choice(lowercase) for a in range(5)) 
 191          if discourse[-1][0] in exemplars: 
 192                  print 'updated', 
 193          else: 
 194                  print 'added', 
 195          exemplars[discourse[-1][0]] = meaning + ' ' + randomid 
 196          print 'reinforcement:', exemplars[discourse[-1][0]] 
 197          if reactionutt in exemplars: 
 198                  print 'updated', 
 199          else: 
 200                  print 'added', 
 201          print 'with', 
 202          exemplars[reactionutt] = reaction + ' ' + randomid 
 203          print exemplars[reactionutt] 
 204   
 205 -def findtopic(discourse, debug=dbg): 
 206          """ Look for a recurring element in the last two utterances, 
 207                  if found, this becomes the new topic. 
 208                  Input: discourse (list of utterances and meanings) 
 209                  Output: clause, or None 
 210          """ 
 211          # prefer second clause, then rest. 
 212          def minprefertwo(seq): 
 213                  seq = list(seq) 
 214                  try: 
 215                          return ((a,b) for a,b in seq if a==2).next() 
 216                  except StopIteration: 
 217                          return min(seq) 
 218   
 219          # take intersection of last utterance with second to last utterance 
 220          # (ignoring operators) 
 221          m1, m2 = discourse[-1][1].split(), discourse[-2][1].split() 
 222          common = intersect(m1[1:], m2) 
 223          if common: 
 224                  # something went wrong with keeping this simple; apologies. 
 225                  topic = minprefertwo( 
 226                          (minprefertwo( 
 227                                  ((m1.index(a), a), 
 228                                  (m2.index(a), a))) for a in common))[1] 
 229                  debug(' topic:', topic) 
 230                  return topic 
 231   
 232 -def interpret(utterance, topic, exemplars, lexicon, debug=dbg): 
 233          """ Return semantic representation for linguistic utterance. 
 234                  This function is in charge of backtracking over initial exemplars 
 235                  and picking the best result, the rest is done by interpretwith() 
 236                  Input: utterance 
 237                  Output: meaning representation 
 238          """ 
 239          def match(a, b): 
 240                  return len(intersect(a, b.split())) 
 241   
 242          utterance = utterance.replace('[= ', '[=') 
 243          words = utterance.split() 
 244          initial = [(match(words, a), a, b) for a,b in exemplars.items()] 
 245   
 246          # backtrack over all exemplars with any overlap at all: 
 247          #initial = [a[1:] for a in sorted(initial, reverse=True) if a[0]] 
 248   
 249          #only backtrack over exemplars with highest overlap (much faster): 
 250          n = max(initial)[0] 
 251          if n: 
 252                  initial = [a[1:] for a in initial if a[0] == n] 
 253          else: 
 254                  # if none of the words are in corpus we can just forget about it: 
 255                  return '' 
 256   
 257          i = [] 
 258          for exutt, exmeaning in initial: 
 259                  #print 'trying: %s' % a[1] 
 260                  w = [b for b in words if b not in exutt] 
 261                  n, meaning = interpretwith(w, exmeaning, exemplars, lexicon, nodbg) 
 262                  if meaning: 
 263                          # scoring (heuristics): 
 264                          #  1: distance to generation check (lexicon) 
 265                          #  2: nr of exemplars,  
 266                          #  3: inverse of meaning length 
 267                          #  4: word lengths of matches in initial exemplar  
 268                          #               (longer words are less likely to be  
 269                          #               unimportant noise/function words). 
 270                          #  5: nr of substitutions,  
 271                          # TODO heuristic: amount of pairwise overlap between exemplars used 
 272                          # this would require book-keeping during recursion.. 
 273                          i.append(( 
 274                                  n, 
 275                                  edit_dist(lexiconcheck(utterance, meaning, lexicon), meaning), 
 276                                  1.0 / len(tokens(meaning)), 
 277                                  #edit_dist(exmeaning, meaning), 
 278                                  #1.0 / sum(len(a) for a in (b for b in words if b in exutt)), 
 279                                  #end of heuristics. initial exemplar: 
 280                                  (exutt, exmeaning))) 
 281          #to debug choice of initial exemplar, enable this to see all candidates: 
 282          #pprint(sorted(i)) 
 283          if i: 
 284                  # find lowest score: 
 285                  a = min(i)[-1] 
 286                  debug('initial exemplar:') 
 287                  debug('\t', a) 
 288   
 289                  # try to merge topic (but don't care if it fails, avoid substition): 
 290                  c = conciliate(topic, a[1], varclauses(b)) 
 291                  if c: 
 292                          b = c 
 293                  else: 
 294                          b = a[1] 
 295   
 296                  # do interpretation again, this time with debug output: 
 297                  w = [c for c in words if c not in a[0].split()] 
 298                  debug('words left:', repr(' '.join(w))) 
 299                  if w: 
 300                          return interpretwith(w, b, exemplars, lexicon, debug)[1] 
 301                  #else: no words left to interpret 
 302                  return b 
 303          else: 
 304                  return '' 
 305   
 306 -def lexiconcheck(utt, meaning, lex): 
 307          # generation check, filter interpretation  
 308          # by that which is present according to lexicon. 
 309          # disabled because the lexicon is too incomplete for this. 
 310          def elim(a): 
 311                  return a.replace('[=%s]' % a[2:-1], a[2:-1]).strip().lower() 
 312          revlex = revdict(lex) 
 313          # map utterance to lexicon items 
 314          uttlex =  " ".join(lex[elim(b)] for b in utt.split() if elim(b) in lex) 
 315          # reconstruct interpretation according to own knowledge 
 316          # select those clauses that occur in reverse lexicon 
 317          # or in utterance --> lexicon mapping 
 318          # don't be picky with speech acts 
 319          def check(a): 
 320                  return a not in revlex or a in uttlex or a[-1] == ':' 
 321          return " ".join((a for a in meaning.split() if check(a))) 
 322   
 323   
 324 -def interpretwith(words, partialmeaning, exemplars, lexicon, debug=dbg): 
 325          """ Interpretation helper function called by interpret(), work out meaning 
 326                  of remaining words by stitching together the best matching fragments. 
 327          """ 
 328          l = len(words) 
 329   
 330          # lexicon check is too permissive 
 331          #if words[0] not in lexicon: 
 332          # skip over words not in corpus 
 333          if words and words[0][0] != '[': 
 334                  if words[0] not in ' '.join(exemplars.keys()).split(): 
 335                          debug("skipping", repr(words[0])) 
 336                          words = words[1:] 
 337   
 338          # handle demonstratives 
 339          # eg. "this [=ball]" should add the meaning of "ball" 
 340          if partialmeaning and [w for w in words if '[' in w]: 
 341                  word = [w for w in words if '[' in w][0][2:-1].strip().lower() 
 342                  # TODO multi-word, eg. ice cream ... 
 343                  if word in lexicon: 
 344                          pm = conciliate(lexicon[word], partialmeaning, None, debug) 
 345                          if pm: 
 346                                  partialmeaning = pm 
 347                                  debug("demonstrative dereferenced:", partialmeaning) 
 348                          else: 
 349                                  # fail! 
 350                                  return 0, '' 
 351                                  partialmeaning[0] += ' ' + lexicon[word] 
 352                                  debug("appended demonstrative:", partialmeaning) 
 353                  words = [a for a in words if a[0] != '['] 
 354   
 355          for sub in substr_iter(words): 
 356                  substring = " ".join(sub) 
 357                  for b in exemplars: 
 358                          # check whether the substring occurs and match whole words only 
 359                          if substring in b and set(sub).issubset(b.split()): 
 360                                  pm = partialmeaning 
 361                                  # turn on substition for known words 
 362                                  subst = [lexicon[word] for word in sub if word in lexicon] 
 363                                  pm = conciliate(exemplars[b], partialmeaning, subst + [''], debug) 
 364                                  if pm: 
 365                                          # remove all words in exemplar from queue 
 366                                          #if c in b.split() 
 367                                          matches = [c for c in words if c in sub] 
 368                                          words = [c for c in words if c not in sub] 
 369                                          debug(' ', repr(' '.join(matches)), 'in', repr(b)) 
 370                                          debug(' and', repr(exemplars[b])) 
 371                                          debug(' matches', repr(pm)) 
 372                                          #recurse: 
 373                                          m, p = interpretwith(words, pm, exemplars, lexicon, debug) 
 374                                          return 1 + m, p 
 375          # no match yet? 
 376          if words: 
 377                  return 0, '' #partialmeaning[0] 
 378          return 0, partialmeaning 
 379   
 380 -def response(meaning, exemplars, debug=dbg): 
 381          """ Transform a meaning into a response using adjacency pairs of speech 
 382                  acts. 
 383                  Input: meaning representation 
 384                  Output: meaning representation 
 385          """ 
 386          def vars(m): 
 387                  return len([a for a in tokens(m) if var(a)]) 
 388          if 'imperative' in meaning: 
 389                  # refusal doesn't occur in corpus, so disabled: 
 390                  #op = choice(('acknowledgement: ', 'refusal: ')) 
 391                  return 'acknowledgement ' + meaning.split(':')[1] 
 392          elif 'ynquestion' in meaning: 
 393                  op = choice(('agreement', 'denial')) 
 394                  return op 
 395          elif 'whquestion' in meaning: 
 396                  if not ':' in meaning: 
 397                          return '' 
 398                  m = 'assertion:' + meaning.split(':', 1)[1] 
 399                  #d = [a for a in exemplars.values() if conciliate(a, [m])] 
 400                  # edit distance on clauses, plus a penalty for variables 
 401                  # (this penalty implements a bias for definite information): 
 402                  def score(m, a): 
 403                          return edit_dist(tokens(m), tokens(a)) + 0.5*vars(a) 
 404                  d = [(score(m, a), a) for a in exemplars.values()] 
 405                  #print sorted(d) 
 406                  if min(d)[0] < 1: 
 407                          #choices = [a[1] for a in d if a[0] == min(d)[0]] 
 408                          #max variance to be eligible is twice the minimal distance 
 409                          choices = [a[1] for a in d if a[0] < 1] 
 410                          debug("possible reactions:", sorted(choices)) 
 411                          if choices: 
 412                                  r = choice(choices) 
 413                          else: 
 414                                  return '' 
 415                          m = conciliate(r, m, varclauses(m)) 
 416                          if m: 
 417                                  n = unifies(r, m) 
 418                                  if n: 
 419                                          if len(choices) > 1: 
 420                                                  debug('choice:', m) 
 421                                          return n 
 422                                  if len(choices) > 1: 
 423                                          debug('choice:', m) 
 424                                  return m 
 425                  else: 
 426                          return m #'' 
 427          #here we can decide to take spontaneous initiative (eg. "more juice!") 
 428          elif 'assertion' in meaning: 
 429                  #re-phrase as two word utterance 
 430                  return meaning 
 431                  #return 'acknowledgement' 
 432   
 433          # here we could express confusion 
 434          return '' 
 435   
 436 -def express(meaning, exemplars, lexicon): 
 437          """ Express `meaning' by returning the most similar exemplar. """ 
 438          if meaning in exemplars.values(): 
 439                  return choice(revdict(exemplars)[meaning]) 
 440          else: 
 441                  #find approximation 
 442                  # edit distance on clauses: 
 443                  def dist(m, a): 
 444                          return edit_dist(m.split(), a.split()) 
 445                  d = [(dist(meaning, b), a) for a, b in exemplars.items()] 
 446                  if min(d)[0] < len(meaning): 
 447                          return choice([a for a in d if a[0] == min(d)[0]])[1] 
 448                  else: 
 449                          return '0' 
 450   
 451 -def expressmulti(meaning, exemplars, constructions, lexicon): 
 452          """ Express `meaning' by returning a matching exemplar or construction """ 
 453          # exemplar filtered according to constructions 
 454          def check(a): 
 455                  return a in ' '.join(constructions.keys()) 
 456          if meaning in exemplars: 
 457                  return ' '.join(a for a in exemplars[meaning] if check(a)) 
 458          # try constructions 
 459          if meaning in constructions.values(): 
 460                  return choice(revdict(constructions)[meaning]) 
 461          # this sounds a little poetic: 
 462          if any(meaning in clauses for clauses in constructions.values()): 
 463                  return choice([utt for utt, clauses in constructions.items() 
 464                          if meaning in clauses]) 
 465          if any(clauses in meaning for clauses in constructions.values()): 
 466                  return choice([utt for utt, clauses in constructions.items() 
 467                          if clauses in meaning]) 
 468          # fall back on two word stage 
 469          return express2(meaning, exemplars, constructions, lexicon) 
 470   
 471 -def express2(meaning, exemplars, constructions, lexicon, debug=dbg): 
 472          """ Transform a meaning into a two word utterance using exemplars, 
 473                  constructions or the lexicon. Filter result according to lexical 
 474                  knowledge. 
 475                  Input: meaning representation 
 476                  Output: one or two word utterance 
 477          """ 
 478          # filter out words not in lexicon 
 479          def reduce(utt): 
 480                  r=' '.join(a for a in utt.split() if a.lower() in lexicon) 
 481                  if r == None: 
 482                          return '' 
 483                  return r 
 484   
 485          # first try to find a matching exemplar 
 486          if meaning in exemplars.values(): 
 487                  utt = choice(revdict(exemplars)[meaning]) 
 488                  debug('reduced:', utt) 
 489                  return reduce(utt) 
 490   
 491          # try constructions 
 492          if meaning in constructions.values(): 
 493                  utt = choice(revdict(constructions)[meaning]) 
 494                  debug('reduced:', utt) 
 495                  return reduce(utt) 
 496          # this sounds a little poetic: 
 497          if any(meaning in clauses for clauses in constructions.values()): 
 498                  utt = choice([utt for utt, clauses in constructions.items() 
 499                          if meaning in clauses]) 
 500                  debug('reduced:', utt) 
 501                  return reduce(utt) 
 502   
 503          # if that fails: 
 504          # use lexicon to fetch possible words, pick two and string together 
 505          revlex = revdict(lexicon) 
 506          if ':' in meaning: 
 507                  # might want to do something with operators as well? 
 508                  meaning = meaning.split() 
 509          else: 
 510                  # operator without further clauses 
 511                  if meaning in revlex: 
 512                          return revlex[meaning] 
 513                  else: 
 514                          return '' 
 515          if not arg(meaning[1]) or arg(meaning[1])[1] not in uppercase: 
 516                  clause = meaning[1] 
 517          else: 
 518                  return '' 
 519          debug("trying to express:", clause) 
 520   
 521          # express first clause if in lexicon: 
 522          for a in meaning[1:]: 
 523                  if a in revlex: 
 524                          return choice(revlex[a]) 
 525          if clause in revlex: 
 526                  return revlex[clause].pop() 
 527   
 528          # compose from lexicon using predicate and argument 
 529          topic = arg(meaning[1]) 
 530          if topic and topic[0] in uppercase: 
 531                  if len(meaning) > 2: 
 532                          topic = pred(meaning[2]) 
 533                  else: 
 534                          topic = meaning[0] 
 535          comment = pred(meaning[1]) 
 536   
 537          candidates = [a for a,b in lexicon.items() if comment in b] 
 538          if candidates: 
 539                  utt = choice([a for a,b in lexicon.items() if comment in b]) 
 540                  debug('comment:', comment, 'in', utt) 
 541          else: 
 542                  utt = '' 
 543   
 544          # decide whether to express topic. TODO: think of non-random way to decide 
 545          if choice([0, 1]): 
 546                  return utt 
 547   
 548          candidates = [a for a,b in lexicon.items() if topic in b and utt not in a] 
 549          if candidates: 
 550                  a = choice(candidates) 
 551                  utt = a + ' ' + utt 
 552                  debug('topic:', topic, 'in', utt) 
 553          else: 
 554                  debug('topic not expressible?', topic) 
 555          return utt 
 556   
 557 -def inferconstructions(exemplars, lexicon, constructions={}, debug=dbg): 
 558          """ Build corpus of constructions from exemplars and lexicon. 
 559                  Input: exemplars & lexicon. 
 560                  Output: constructions, dictionary of pairings between substrings 
 561                  and clauses. """ 
 562          debug('distilling constructions') 
 563          # take substr_iter of all sentences, chain these 
 564          scores = defaultdict(int) 
 565          n = len(constructions) 
 566          for utt, meaning in exemplars.items(): 
 567                  for a in substr_iter(utt.split()): 
 568                          # constructions consist of two or more words (lexicon otherwise): 
 569                          if len(a) == 1: 
 570                                  break 
 571                          # filter out non-alphabetic characters 
 572                          words = ' '.join(b.lower() for b in a if all(c.lower() in lowercase for c in b)) 
 573                          if ' ' not in words: 
 574                                  continue 
 575                          # increment count in constructions if this one is unknown 
 576                          if not any(words in b for b in constructions): 
 577                                  score = len([c for c in exemplars if words in c]) 
 578                                  if score: 
 579                                          scores[words] = score 
 580          # shouldn't this be a priority queue instead? 
 581          # iterate over counts, take highest counts and pick longest constructions 
 582          for score, form in sorted(revdict(scores).items()): 
 583                  form = form[0] 
 584                  if form not in scores: 
 585                          # skip this construction if it has been removed meanwhile 
 586                          continue 
 587                  for a in scores.keys(): 
 588                          # remove shorter constructions 
 589                          if a in form and form != a: 
 590                                  scores.pop(a) 
 591                          # longer constructions are OK 
 592                          # however: this means that the shorter one should abstract! 
 593                          # use lexicon to check presence of words. 
 594   
 595                  # replace score with meaning, intersect like inferlexicon 
 596                  meaning = findmeaning(form, exemplars, lexicon, dbg) 
 597                  if meaning: 
 598                          debug('construction (score %d)' % score, repr(form), repr(meaning)) 
 599                          constructions[form] = meaning 
 600                  else: 
 601                          ms = [meaning for utt, meaning in exemplars.items() if form in utt] 
 602                          score = '(score %d)' % score 
 603                          candidate = 'not found in candidates %s', str(ms) 
 604                          debug('meaning of construction', repr(form), score, candidate) 
 605                  # if not found the next pass might find it by elimination 
 606   
 607          #print sorted(revdict(scores).items()) 
 608          # base case (no change) 
 609          if n == len(constructions): 
 610                  return constructions 
 611          # recurse  
 612          return inferconstructions(exemplars, lexicon) 
 613   
 614 -def findmeaning(form, exemplars, lexicon, debug=dbg): 
 615          """ Given the words in a construction, find the most common meaning 
 616                  associated with it in the corpus of exemplars. """ 
 617          ms = [meaning for utt, meaning in exemplars.items() if form in utt] 
 618          #print ms 
 619          #ehh, why this whole intersection business? 
 620          revlex = revdict(lexicon) 
 621          if len(ms) > 1: 
 622                  m = [a.split() for a in ms] 
 623                  m = reduce(intersect, m[1:], m[0]) 
 624                  #print m 
 625                  if m: 
 626                          n = [] 
 627                          # if some word for a clause is not present in construction,  
 628                          # abstract its meaning 
 629                          def abstract(a): 
 630                                  if a in revlex and any(b in form for b in revlex[a]): 
 631                                          return a 
 632                                  else: 
 633                                          if a[0] == '(': 
 634                                                  return '(X)' 
 635                                  #abstract operators 
 636                                  return a #'' 
 637                          return ' '.join(abstract(a) for a in m) 
 638                  else: 
 639                          # prefer most frequent sub-interpretation (allow non-contiguous?) 
 640                          # --> combinations instead 
 641                          return max((ms.count(a), a) for a in ms)[1] 
 642                          scores = defaultdict(int) 
 643                          for meaning in ms: 
 644                                  for a in substr_iter(meaning.split()): 
 645                                          m = ' '.join(a) 
 646                                          # increment count in constructions if this one is unknown 
 647                                          scores[m] += 1 
 648                          return max(revdict(scores))[1][0] 
 649                  #elif len(m) > 1: 
 650                  #       # if multiple possibilities arise, prefer least frequent predicate 
 651                  #       def freq(a): 
 652                  #               return ' '.join(exemplars.values()).count(pred(a)) 
 653                  #       return ' '.join(min((freq(a), a) for a in m)[1]) 
 654          if ms: 
 655                  return ms.pop() 
 656   
 657  # a mutable default argument is essentially a static variable (yay for arcana!) 
 658 -def inferlexicon(exemplars, verbose=False, lexicon={}, debug=dbg): 
 659          """ Infer lexicon from corpus of exemplars. 
 660                  Input: exemplars. 
 661                  Output: lexicon, dictionary of word-clause pairings. """ 
 662          exlcase = dict((a.lower(), b) for a,b in exemplars.items()) 
 663          n = len(lexicon) 
 664          # look for word indices linking meanings to words: 
 665          for utt,m in exemplars.items(): 
 666                  if '[' in m: 
 667                          n = int(m[m.index('[')+1 : m.index(']')]) 
 668                          clause = m[:m.index('[')].split()[-1] 
 669                          clause += m[m.index(']')+1:].split()[0] 
 670                          word = utt.split()[n].lower() 
 671                          exemplars[utt] = m[:m.index('[')] + m[m.index(']')+1:] 
 672                          if word not in lexicon: 
 673                                  lexicon[word] = clause 
 674          for word in set(" ".join(exemplars.keys()).split()) - set(lexicon.keys()): 
 675                  word = word.lower() 
 676                  utterances = [a.split() for a in exlcase.keys() if word in a.split()] 
 677                  meanings = [b.split() for a,b in exlcase.items() if word in a.split()] 
 678   
 679                  # reduce meanings by matching words with meanings: 
 680                  for a in list(reduce(intersect, meanings, meanings[0])): #[::-1]: 
 681                          if word in a: 
 682                                  if len(a) == 1: 
 683                                          lexicon[word] = a 
 684                                  else: 
 685                                          # select least frequent predicate (assumption is that  
 686                                          # this predicate is most specific to this word) 
 687                                          def freq(b): 
 688                                                  return ' '.join(exemplars.values()).count(pred(b)) 
 689                                          candidates = ((freq(b), b) for b in a.split()) 
 690                                          lexicon[word] = min(candidates)[1] 
 691                                  break 
 692                  # if this didn't succeed and the set of exemplars which contain the 
 693                  # word reduces to a single meaning, use this: 
 694                  #DISABLED 
 695                  if False and word not in lexicon: 
 696                          a = reduce(intersect, utterances, utterances[0]) 
 697                          if len(a - set(lexicon.keys())) == 1: 
 698                                  lexicon[word] = reduce(intersect, meanings, meanings[0]) 
 699                                  if len(lexicon[word]) == 0: 
 700                                          lexicon[word] = '' 
 701                                  else: 
 702                                          lexicon[word] = lexicon[word].pop() 
 703                  # a single word utterance carrying a single meaning: 
 704                  if len(utterances) == len(meanings) == len(utterances[0]): 
 705                          if len(utterances) == len(meanings[0]) == 1: 
 706                                  lexicon[word] = meanings[0][0] 
 707          if verbose: 
 708                  notfound = set(' '.join(exlcase.keys()).split()) - set(lexicon.keys()) 
 709                  debug('lexicon: not found:', notfound, '\n') 
 710          # recurse as long as new meanings are inferred: 
 711          if len(lexicon) == n: 
 712                  return inferlexicon(exemplars, verbose) 
 713          return lexicon 
 714   
 715 -def conciliate(meaning, partialmeaning, subst=None, debug=dbg): 
 716          """ Test whether meaning can be made comptabible with partialmeaning, 
 717                  by looking for a family resemblance with partialmeanings,  
 718                  if found, perform a substitution if necessary. Returns empty string on failure. 
 719   
 720                  >>> conciliate('question: animal(bunny) do(X)', 'assertion: point(dog) animal(dog)') 
 721                  substituted (bunny) for (dog) 
 722                  'assertion: point(bunny) animal(bunny)' 
 723                  >>> conciliate('assertion: do(hop) animal(bunny)', 'assertion: animal(bunny) do(X)') 
 724                  instantiated (X) with (hop) 
 725                  'assertion: animal(bunny) do(hop)' 
 726                  >>> conciliate('assertion: do(hop) animal(bunny)', 'assertion: point(bunny)') 
 727                  '' 
 728                  """ 
 729          pm = partialmeaning 
 730          if subst == None: 
 731                  # None as in no restrictions 
 732                  subst = pm.split() 
 733          if pm == '': 
 734                  return meaning 
 735                  partialmeaning = meaning 
 736                  return True 
 737          success = False 
 738          for part in meaning.split(): 
 739                  if pred(part)+'(' in pm: 
 740                          # if there is an argument and it's not variable: 
 741                          # instantiate / substitute 
 742                          if arg(part) and not (var(arg(part)) or arg(part) in pm): 
 743                                  oldarg = tokens(pm)[tokens(pm).index(pred(part)) + 1] 
 744                                  # instantiate or substitute if allowed 
 745                                  if True in [pred(part) in a for a in subst]: 
 746                                          if oldarg[1] in uppercase: 
 747                                                  debug("instantiated", oldarg, "with", arg(part)) 
 748                                                  partialmeaning = pm = pm.replace(oldarg, arg(part)) 
 749                                          else: 
 750                                                  debug("substituted", arg(part), "for", oldarg) 
 751                                                  partialmeaning = pm = pm.replace(oldarg, arg(part)) 
 752                          success = True 
 753                          # disable this to match as much as possible: 
 754                          # return success 
 755                  #probe for variable predicate in first argument 
 756                  if pred(part) in uppercase: 
 757                          return partialmeaning 
 758                          return True 
 759                  # probe for variable predicate in second argument 
 760                  vars = [pred(a) for a in pm.split() if pred(a) in uppercase] 
 761                  # instantiate variable predicate if part is not an operator  
 762                  # and this predicate is not already present 
 763                  if vars and ':' not in part and pred(part) not in pm: 
 764                          partialmeaning = pm.replace(vars[0], pred(part)) 
 765                          debug("instantiated variable predicate with", pred(part)) 
 766                          # recurse to replace further vars 
 767                          return conciliate(meaning, partialmeaning, subst, debug) 
 768                  """ 
 769                  #this code would allow to substitute predicates if arguments match: 
 770                  elif arg(part) in pm and arg(part): 
 771                          ar = arg(part) 
 772                          oldpred = pm[pm.index(ar):pm.index(ar)+len(ar)] 
 773                          print ar 
 774                          print oldpred 
 775                          exit() 
 776                          print "substituted", pred(part), "for", oldpred 
 777                          partialmeaning[0] = pm.replace(oldpred, pred(part)) 
 778                          return True 
 779                  """ 
 780          if success: 
 781                  return partialmeaning 
 782          return '' 
 783          return success #False 
 784   
 785 -def unifies(meaning, partialmeaning, debug=dbg): 
 786          """ succeed if everything in "partialmeaning" is compatible with "meaning", 
 787                  substituting along the way. Similar to conciliate() but operates on the 
 788                  whole meaning instead of looking at individual clauses, and does not 
 789                  perform substitution, merely instantiation. 
 790   
 791                  >>> unifies('assertion: do(hop) animal(bunny)', 'assertion: do(X) animal(bunny)') 
 792                  substituted (hop) for (X) 
 793                  'assertion: do(hop) animal(bunny)' 
 794                  >>> unifies('assertion: do(hop) animal(bunny)', 'assertion: animal(bunny) do(X)') 
 795                  '' 
 796   
 797          """ 
 798          pm = partialmeaning 
 799          if pm == '': 
 800                  return meaning 
 801                  partialmeaning = meaning 
 802                  return True 
 803   
 804          for part1, part2 in zip(pm.split(), meaning.split()): 
 805                  if part1 == part2: 
 806                          continue 
 807                  elif var(arg(part1)): 
 808                          newarg = tokens(meaning)[tokens(meaning).index(pred(part2)) + 1] 
 809                          debug("substituted", newarg, "for", arg(part1)) 
 810                          partialmeaning = pm = pm.replace(arg(part1), newarg) 
 811                  else: 
 812                          # meaning and partialmeaning are in conflict 
 813                          return '' #False 
 814          return partialmeaning #True 
 815   
 816 -def getexemplars(): 
 817          """ Obtain corpus, either by importing from module `exemplars,' 
 818                  or by falling back to a small sample corpus. """ 
 819          try: 
 820                  #self-referential import... 
 821                  from exemplars import getexemplars 
 822          except ImportError: 
 823                  pass 
 824          else: 
 825                  return getexemplars() 
 826          #""" 
 827          # sample corpus: 
 828          return {'uh':                                   '', 
 829                  'eh':                                   '', 
 830                  'ah':                                   'acknowledgement', 
 831                  'ok':                                   'confirmation', 
 832                  'yes':                                  'confirmation', 
 833                  'yeah':                                 'confirmation', 
 834                  'no':                                   'denial', 
 835                  'that\'s right !'               : 'reinforcement', 
 836                  'ball'                                  : 'assertion: point(ball) toy(ball)', 
 837                  'throw it to me'                : 'imperative: throw[0](X) toy(X)', 
 838                  'you kick the football !' : 'imperative: kick(football) toy(football)', 
 839                  'what is this ?'                : 'whquestion: point(X) Y(X)', 
 840                  #should be a yes/no question, but child responded with 'cow'. perhaps 
 841                  #assertion instead? 
 842                  'is that a cow ?'               : 'whquestion: point(cow) animal(cow)', 
 843                  'what does a bunny do ?':       'whquestion: do(X) animal(bunny)', 
 844                  'what does a cow say ?' : 'whquestion: do(X) animal(cow)', 
 845                  'what does a duckie say ?': 'whquestion: do(X) animal(duck[3])', 
 846                  'what animal does woof woof ?':         'whquestion: animal(X) do(woof)', 
 847                  'what\'s a kitty say ?' : 'whquestion: do(X) animal(cat[2])', 
 848                  'that\'s a kitty'               : 'assertion: point(cat) animal(cat[2])', 
 849                  'donkey'                                : 'assertion: point(donkey) animal(donkey)', 
 850                  'that\'s a donkey'              : 'assertion: point(donkey) animal(donkey)', 
 851                  'meouw'                                 : 'assertion: do(meouw) animal(cat)', 
 852                  'quack'                                 : 'assertion: do(quack) animal(duck)', 
 853                  'moo'                                   : 'assertion: do(moo) animal(cow)', 
 854                  'bunny'                                 : 'assertion: animal(bunny) do(hop)', 
 855                  'dog'                                   : 'assertion: animal(dog) do(woof)', 
 856                  'hop'                                   : 'assertion: do(hop) animal(bunny)', 
 857                  'woof woof'                             : 'assertion: do(woof) animal(dog)', 
 858                  'want some juice ?'             : 'ynquestion: want(juice) food(juice)', 
 859                  'chocolate'                             : 'assertion: food(chocolate)', 
 860                  'what\'s inside ?'              : 'whquestion: inside(X) toy(box)', 
 861                  #child guesses correctly 
 862                  'book'                                  : 'assertion: inside(book) toy(box)', 
 863                  #child takes out book & mother confirms 
 864                  'book !'                                : 'assertion: point(book)' 
 865                  } 
 866   
 867 -def test(): 
 868          """ Interpret some sample utterances. """ 
 869          test = [ 
 870                  'what does a bunny do ?', 
 871                  'kitty do ?',  #ellipsis 
 872                  'throw the ball', 
 873                  'catch the ball', 
 874                  'where lives birdie ?',  #stumble on generalization 
 875                  "that's a ball",  #make topic 
 876                  'can you throw it ?',  #use topic 
 877                  'what does a duckie say ?', 
 878                  'what does this animal [=cow] do ?', 
 879                  'do you want to play with that [=ball] ?', 
 880                  'what does a ball say ?' # should fail 
 881                  # disabled: (chocolate / kick not in corpus) 
 882                  #'want some chocolate ?' 
 883                  #'kick the ball !', 
 884                  ] 
 885          #print evaluate('\n'.join('*MOT:  %s' % a for a in test)) 
 886          exemplars = getexemplars() 
 887          lexicon = inferlexicon(exemplars) 
 888          for a in test: 
 889                          print a 
 890                          b = interpret(a, '', exemplars, lexicon, nodbg) 
 891                          print 'interpretation:', b 
 892   
 893 -def edit_dist(source, target): 
 894          """ Edit distance of two sequences. Non-standard features are that all 
 895                  mutations have a cost of 1 (so as not to favor insertions or deletions 
 896                  over substitutions), as well as a decay based on index 
 897                  (mutations with higher indices weigh less).  
 898                   
 899                  >>> edit_dist('foo', 'bar') 
 900                  2.4890507991136213""" 
 901          def substcost(): 
 902                  if a == b: 
 903                          return 0 
 904                  # default is 2, but we should avoid a bias for insertion/deletions 
 905                  else: return 1 
 906          #decay function that weighs higher positions lower 
 907          def decay(n): 
 908                  # with a rate of 0.2 the third position in a sequence will 
 909                  # have a weight of around 0.5 (initial position weighs 1.0) 
 910                  return math.e ** (-0.2 * n) 
 911   
 912          # initialize distance matrix 
 913          # this looks better but doesn't work (tm) 
 914          #distance = [[0] * (len(target) + 1)] * (len(source) + 1) 
 915          distance = [[0] * (len(target) + 1) for b in range(len(source)+1)] 
 916          for i, row in enumerate(distance): 
 917                  row[0] = i 
 918          for j in range(len(distance[0])): 
 919                  distance[0][j] = j 
 920          distance[0][0] = 0 
 921   
 922          # populate distance matrix 
 923          for i, a in enumerate(source): 
 924                  for j, b in enumerate(target): 
 925                          insc = distance[i][j+1] + decay(i) 
 926                          substc = distance[i][j] + decay(i) * substcost() 
 927                          delc = distance[i+1][j] + decay(i) 
 928                          distance[i+1][j+1] = min((substc, insc, delc)) 
 929          return distance[-1][-1] 
 930   
 931 -def substr_iter(seq): 
 932          """ Return all substrings of a sequence, in descending order of length. 
 933                   
 934                  >>> list(substr_iter('abc')) 
 935                  ['abc', 'ab', 'bc', 'a', 'b', 'c']""" 
 936          for a in range(len(seq), -1, -1): 
 937                  for b in range(a, len(seq)): 
 938                          yield seq[b-a:b+1] 
 939   
 940 -def revdict(d): 
 941          """ Reverse dictionary, ie. swap values and keys; since a value may occur 
 942                  with multiple keys, return a list of all keys associated with a value. 
 943   
 944                  >>> revdict({0: 1, 1: 2}) 
 945                  {1: [0], 2: [1]}""" 
 946          # ignore multiple keys for a value: 
 947          # return dict(a[::-1] for a in d.items()) 
 948          return dict((a, [b for b,c in d.items() if c==a ]) for a in d.values()) 
 949   
 950 -def varclauses(m): 
 951          """ >>> varclauses(['foo(bar) zed(X)']) 
 952                  ['zed(X)']""" 
 953          return [a for a in m[0].split() if var(arg(a))] 
 954   
 955 -def pred(token): 
 956          """ >>> pred('foo(bar)') 
 957                  'foo'""" 
 958          if '(' in token: 
 959                  return token[:token.index('(')] 
 960          else: return token 
 961   
 962 -def arg(token): 
 963          """     >>> arg('foo(bar)') 
 964                  '(bar)'""" 
 965          if '(' in token: 
 966                  return token[token.index('('):] 
 967          else: return '' 
 968   
 969 -def var(arg): 
 970          """ Test whether an argument string is variable: 
 971                   
 972                  >>> var('(bar)') 
 973                  False 
 974                  >>> var('(X)') 
 975                  True 
 976          """ 
 977          return arg and (arg[0] in uppercase or arg[1] in uppercase) 
 978   
 979 -def tokens(m): 
 980          """ Turn meaning into a list of operator, predicates and arguments. 
 981                   
 982                  >>> tokens('whquestion: do(X) animal(bunny)') 
 983                  ['whquestion:', 'do', '(X)', 'animal', '(bunny)'] 
 984          """ 
 985          return m.replace('(', ' (').split() 
 986   
 987 -def intersect(a,b): 
 988          """ >>> intersect([1,2,3], [2,3]) 
 989                  set([2, 3])""" 
 990          return set(a).intersection(set(b)) 
 991   
 992  # this trick runs main when the script is executed, not when imported: 
 993  if __name__ == '__main__': 
 994          import doctest 
 995          # do doctests, but don't be pedantic about whitespace (I suspect it is the 
 996          # militant anti-tab faction who are behind this obnoxious default) 
 997          fail, attempted = doctest.testmod(verbose=False, 
 998                  optionflags=doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS) 
 999          if attempted and not fail: 
1000                  print "%d doctests succeeded!" % attempted 
1001          main() 
1002