Module model
[hide private]
[frames] | [no frames]

Source Code for Module model

   1  """ 
   2          Bachelor AI project 2009. Andreas van Cranenburgh 0440949. 
   3          Two-word stage dialogue simulator using a corpus of exemplars. 
   4   
   5          Interactive usage: 
   6          $ python model.py 
   7   
   8          Other uses: 
   9                  - Re-enact dialogue in 01.cha (should be a text file with one utterance 
   10                    per line, with each line preceded by *MOT or *CHI, as in the CHILDES 
  11                    corpus): 
  12   
  13                    $ python 
  14  #                 >>> import model 
  15  #                 >>> model.evaluate(open('01.cha').read()) 
  16                    ... 
  17   
  18                  - Talk to self for 25 utterances: 
  19   
  20  #                 >>> model.selftalk(25) 
  21                    ... 
  22   
  23                  - Demonstrate some sample utterances that work (but not from corpus): 
  24   
  25  #                 >>> model.test() 
  26                    ... 
  27  """ 
  28   
  29  # (before continuing set tabstops to 4) 
  30  # update documentation by doing "pydoc -w model", then open "model.html" 
  31  # alternatively, use "epydoc model". 
  32   
  33  #random factoid: 
  34  #You can reference a list comprehension as it is being built by the symbol 
  35  #'_[1]'. For example, the following function unique-ifies a list of elements 
  36  #without changing their order by referencing its list comprehension. 
  37  # 
  38  #def unique(my_list): 
  39  #    return [x for x in my_list if x not in locals()['_[1]']] 
  40   
  41  try: 
  42          from free_will import choice 
  43  except ImportError: 
  44          from random import choice 
  45  from string import uppercase, lowercase 
  46  from collections import defaultdict 
  47  from pprint import pprint 
  48  #stands for pretty-print; library implementors have some strange standards 
  49  import math 
  50   
def main():
	""" Interactive textual user interface.

	Prints the corpus utterances and the inferred lexicon, then reads
	parent utterances from stdin in a loop (until `quit') and prints the
	simulated child's reply for each one. """
	print "Child dialogue simulator.",
	print "Enter parent utterance (or `quit' to quit)."
	print
	print "Utterances in corpus: "
	print getexemplars().keys()
	print
	# verbose=True so missing lexicon entries are reported
	lexicon = inferlexicon(getexemplars(), True)
	print 'Lexicon (distilled from corpus): ', lexicon
	#test(getexemplars(), lexicon)
	d = dialogue()
	# prime the co-routine: advance it to its first `yield'
	d.send(None)
	while True:
		print "Parent: ",
		utt = raw_input()
		if utt == 'quit':
			break
		reply = d.send(utt)
		print "Child: ", reply
71
def dbg(*m):
	""" Print debug output (also see nodbg).
	Takes a variable number of arguments of any type.

	>>> dbg('choices', range(3))
	choices [0, 1, 2]
	"""
	# expandtabs() keeps tab-indented reprs readable in the output.
	# The redundant list() wrapper of the original was dropped:
	# str.join accepts a generator directly.
	print(' '.join(str(a).expandtabs() for a in m))
def nodbg(*m):
	""" Silent drop-in replacement for dbg(): accepts anything, prints nothing. """
	return None
81
def dialogue(exemplars={}, constructions={}, lexicon={}, debug=dbg):
	""" Co-routine for dialogue. Use send(utterance) to receive a reply.
	Initial input: exemplars
	Further input: one utterance at a time
	Output: one reply at a time

	NOTE(review): the mutable default arguments persist across calls that
	rely on the defaults -- apparently intentional here (see the remark
	near inferlexicon), but worth confirming. """
	if not exemplars:
		exemplars = getexemplars()
	if not lexicon:
		lexicon = inferlexicon(exemplars, False)
	if not constructions:
		constructions = inferconstructions(exemplars, lexicon)
	# discourse holds (utterance, meaning) pairs; seeded with a dummy pair
	discourse, topic, reactionutt = [('','')], '', ''
	while True:
		# hand back the previous reply, receive the next parent utterance
		utt = yield reactionutt
		newmeaning = interpret(utt, topic, exemplars, lexicon)
		if 'reinforcement' in newmeaning and len(discourse) > 1:
			# `meaning' and `reaction' are from the previous iteration;
			# the len(discourse) > 1 guard ensures they have been bound
			reinforce(meaning, reaction, reactionutt, discourse, exemplars)
		meaning = newmeaning
		debug(' interpretation:', meaning)

		#todo: detect correction feedback, store. DISABLED
		if False and utt not in exemplars and meaning:
			if meaning == lexiconcheck(utt, meaning, lexicon):
				debug('added utterance to exemplars')
				exemplars[utt] = meaning
				constructions = inferconstructions(exemplars, lexicon)
			else:
				debug('generation check failed, not added to exemplars')

		reaction = response(meaning, exemplars)
		debug(' reaction:', reaction)
		reactionutt = express2(reaction, exemplars, constructions, lexicon)
		discourse.append((utt, meaning))
		discourse.append((reactionutt, reaction))
		# if the current topic is not present in the parent's utterance,
		# see if there is a new one. otherwise there is topic continuity
		if (topic not in meaning and topic) or not topic:
			topic = findtopic(discourse) or ''
120
def selftalk(u=25, e={}, debug=dbg):
	""" Talk to oneself, picking a random utterance from the exemplars when
	repetition is detected.
	Input: number of utterances to generate, exemplars
	Output: transcript of conversation
	"""
	if not e:
		e = getexemplars()
	# start from a random exemplar utterance
	a = choice(e.keys())
	speaking, listening = '*MOT', '*CHI'
	said = set([a])
	c = []  # accumulated transcript lines

	d = dialogue(e)
	# prime the co-routine
	d.send(None)

	for x in range(u):
		c.append('%s: %s' % (speaking, a))
		debug(c[-1])
		# alternate roles every turn
		speaking, listening = listening, speaking
		r = d.send(a)
		# identical reply means the conversation is looping; break out of
		# the loop by injecting a fresh, not-yet-said utterance
		if a == r and speaking == '*MOT':
			debug('picking random utterance')
			# because sets are not subscriptable, it is not possible to apply
			# choice to sets, perhaps this indicates that the axiom of choice
			# is untenable after all?!
			a = choice(list(set(e.keys()) - said))
			said.add(a)
		else:
			a = r
	print '\n', '\n'.join(c)
152
def evaluate(conv, exemplars={}, n=1):
	""" When presented with a corpus fragment, feed parent utterances to model
	and generate model responses in place of child utterances.
	n = number of iterations to perform
	Input: string with transcript of corpus
	Output: same transcript with replies from model instead of child
	"""
	# idea: only continue for loop if model answer matches child answer?
	# this allows reinforcement to select correct answers
	if not exemplars:
		exemplars = getexemplars()
	# start & initialize co-routine
	d = dialogue(exemplars)
	d.send(None)
	for a in range(n):
		# transcript is rebuilt on each pass; only the last pass is returned
		modelconv = []
		for b in conv.split('\n'):
			try:
				# split 'SPEAKER: utterance' into its two fields
				who, utt = b.split(':', 1)
			except ValueError:
				# ignore unparsable lines:
				continue
			if 'MOT' in who:
				print b
				ans = '*CHI: ' + d.send(utt)
				print ans
				for x in (b, ans):
					modelconv.append(x)
		print
	return '\n'.join((str(a) for a in modelconv))
183
def reinforce(meaning, reaction, reactionutt, discourse, exemplars):
	""" Strengthen connection between last two utterances by adding
	a random identifier and adding or updating the resulting exemplar.
	Input: two utterances with meanings
	Side-effect: updated or added exemplars
	"""
	# a shared 5-letter tag links the two meanings in the exemplar corpus
	randomid = "".join(choice(lowercase) for a in range(5))
	if discourse[-1][0] in exemplars:
		print 'updated',
	else:
		print 'added',
	# tag the most recent utterance's meaning
	exemplars[discourse[-1][0]] = meaning + ' ' + randomid
	print 'reinforcement:', exemplars[discourse[-1][0]]
	if reactionutt in exemplars:
		print 'updated',
	else:
		print 'added',
	print 'with',
	# tag the reaction with the same identifier, pairing the two
	exemplars[reactionutt] = reaction + ' ' + randomid
	print exemplars[reactionutt]
204
def findtopic(discourse, debug=dbg):
	""" Look for a recurring element in the last two utterances,
	if found, this becomes the new topic.
	Input: discourse (list of utterances and meanings)
	Output: clause, or None (implicitly, when nothing recurs)
	"""
	# prefer second clause, then rest.
	def minprefertwo(seq):
		# return the pair with index 2 if present, else the minimum;
		# .next() is Python 2 iterator protocol
		seq = list(seq)
		try:
			return ((a,b) for a,b in seq if a==2).next()
		except StopIteration:
			return min(seq)

	# take intersection of last utterance with second to last utterance
	# (ignoring operators)
	m1, m2 = discourse[-1][1].split(), discourse[-2][1].split()
	common = intersect(m1[1:], m2)
	if common:
		# something went wrong with keeping this simple; apologies.
		# for each shared clause take its best (index, clause) position in
		# either meaning, then pick the best clause overall
		topic = minprefertwo(
			(minprefertwo(
				((m1.index(a), a),
				(m2.index(a), a))) for a in common))[1]
		debug(' topic:', topic)
		return topic
def interpret(utterance, topic, exemplars, lexicon, debug=dbg):
	""" Return semantic representation for linguistic utterance.
	This function is in charge of backtracking over initial exemplars
	and picking the best result, the rest is done by interpretwith()
	Input: utterance
	Output: meaning representation (empty string when nothing matches)
	"""
	# word overlap between the utterance and an exemplar utterance
	def match(a, b):
		return len(intersect(a, b.split()))

	# normalize demonstrative annotations: '[= ball]' -> '[=ball]'
	utterance = utterance.replace('[= ', '[=')
	words = utterance.split()
	initial = [(match(words, a), a, b) for a,b in exemplars.items()]

	# backtrack over all exemplars with any overlap at all:
	#initial = [a[1:] for a in sorted(initial, reverse=True) if a[0]]

	#only backtrack over exemplars with highest overlap (much faster):
	n = max(initial)[0]
	if n:
		initial = [a[1:] for a in initial if a[0] == n]
	else:
		# if none of the words are in corpus we can just forget about it:
		return ''

	# candidate interpretations, as scoring tuples (lower is better)
	i = []
	for exutt, exmeaning in initial:
		#print 'trying: %s' % a[1]
		# the words not covered by this initial exemplar
		w = [b for b in words if b not in exutt]
		n, meaning = interpretwith(w, exmeaning, exemplars, lexicon, nodbg)
		if meaning:
			# scoring (heuristics):
			# 1: distance to generation check (lexicon)
			# 2: nr of exemplars,
			# 3: inverse of meaning length
			# 4: word lengths of matches in initial exemplar
			#    (longer words are less likely to be
			#    unimportant noise/function words).
			# 5: nr of substitutions,
			# TODO heuristic: amount of pairwise overlap between exemplars used
			# this would require book-keeping during recursion..
			i.append((
				n,
				edit_dist(lexiconcheck(utterance, meaning, lexicon), meaning),
				1.0 / len(tokens(meaning)),
				#edit_dist(exmeaning, meaning),
				#1.0 / sum(len(a) for a in (b for b in words if b in exutt)),
				#end of heuristics. initial exemplar:
				(exutt, exmeaning)))
	#to debug choice of initial exemplar, enable this to see all candidates:
	#pprint(sorted(i))
	if i:
		# find lowest score; a = the winning (exutt, exmeaning) pair
		a = min(i)[-1]
		debug('initial exemplar:')
		debug('\t', a)

		# try to merge topic (but don't care if it fails, avoid substition):
		# NOTE(review): `b' is not assigned yet on this line -- it only
		# resolves because Python 2 leaks list-comprehension variables
		# (here the last word from the `w = [b for b ...]' comprehension
		# above). varclauses() then treats that string as a one-element
		# list. Looks accidental; confirm intended behavior.
		c = conciliate(topic, a[1], varclauses(b))
		if c:
			b = c
		else:
			b = a[1]

		# do interpretation again, this time with debug output:
		w = [c for c in words if c not in a[0].split()]
		debug('words left:', repr(' '.join(w)))
		if w:
			return interpretwith(w, b, exemplars, lexicon, debug)[1]
		#else: no words left to interpret
		return b
	else:
		return ''
def lexiconcheck(utt, meaning, lex):
	""" Generation check: filter an interpretation down to the clauses that
	are supported by the lexicon (or that are speech-act operators).
	Input: utterance, its candidate meaning, lexicon
	Output: the meaning with unsupported clauses removed """
	# generation check, filter interpretation
	# by that which is present according to lexicon.
	# disabled because the lexicon is too incomplete for this.
	def elim(a):
		# strip a demonstrative annotation: '[=ball]' -> 'ball';
		# a no-op for ordinary words (the replace finds nothing)
		return a.replace('[=%s]' % a[2:-1], a[2:-1]).strip().lower()
	revlex = revdict(lex)
	# map utterance to lexicon items
	uttlex = " ".join(lex[elim(b)] for b in utt.split() if elim(b) in lex)
	# reconstruct interpretation according to own knowledge
	# select those clauses that occur in reverse lexicon
	# or in utterance --> lexicon mapping
	# don't be picky with speech acts
	def check(a):
		return a not in revlex or a in uttlex or a[-1] == ':'
	return " ".join((a for a in meaning.split() if check(a)))
def interpretwith(words, partialmeaning, exemplars, lexicon, debug=dbg):
	""" Interpretation helper function called by interpret(), work out meaning
	of remaining words by stitching together the best matching fragments.

	Returns (n, meaning) where n counts the recursive matches and meaning
	is '' on failure. """
	l = len(words)  # NOTE(review): unused; presumably a leftover

	# lexicon check is too permissive
	#if words[0] not in lexicon:
	# skip over words not in corpus
	if words and words[0][0] != '[':
		if words[0] not in ' '.join(exemplars.keys()).split():
			debug("skipping", repr(words[0]))
			words = words[1:]

	# handle demonstratives
	# eg. "this [=ball]" should add the meaning of "ball"
	if partialmeaning and [w for w in words if '[' in w]:
		# first bracketed annotation, '[=ball]' -> 'ball'
		word = [w for w in words if '[' in w][0][2:-1].strip().lower()
		# TODO multi-word, eg. ice cream ...
		if word in lexicon:
			pm = conciliate(lexicon[word], partialmeaning, None, debug)
			if pm:
				partialmeaning = pm
				debug("demonstrative dereferenced:", partialmeaning)
			else:
				# fail!
				return 0, ''
				# NOTE(review): unreachable legacy code below; it also
				# indexes partialmeaning[0], suggesting partialmeaning
				# used to be a list -- confirm before re-enabling
				partialmeaning[0] += ' ' + lexicon[word]
				debug("appended demonstrative:", partialmeaning)
		words = [a for a in words if a[0] != '[']

	# try ever shorter word sequences against the exemplar utterances
	for sub in substr_iter(words):
		substring = " ".join(sub)
		for b in exemplars:
			# check whether the substring occurs and match whole words only
			if substring in b and set(sub).issubset(b.split()):
				pm = partialmeaning
				# turn on substition for known words
				subst = [lexicon[word] for word in sub if word in lexicon]
				pm = conciliate(exemplars[b], partialmeaning, subst + [''], debug)
				if pm:
					# remove all words in exemplar from queue
					#if c in b.split()
					matches = [c for c in words if c in sub]
					words = [c for c in words if c not in sub]
					debug(' ', repr(' '.join(matches)), 'in', repr(b))
					debug(' and', repr(exemplars[b]))
					debug(' matches', repr(pm))
					#recurse:
					m, p = interpretwith(words, pm, exemplars, lexicon, debug)
					return 1 + m, p
	# no match yet?
	if words:
		return 0, '' #partialmeaning[0]
	return 0, partialmeaning
379
def response(meaning, exemplars, debug=dbg):
	""" Transform a meaning into a response using adjacency pairs of speech
	acts.
	Input: meaning representation
	Output: meaning representation ('' when no response can be formed)
	"""
	# number of variable tokens in a meaning
	def vars(m):
		return len([a for a in tokens(m) if var(a)])
	if 'imperative' in meaning:
		# refusal doesn't occur in corpus, so disabled:
		#op = choice(('acknowledgement: ', 'refusal: '))
		return 'acknowledgement ' + meaning.split(':')[1]
	elif 'ynquestion' in meaning:
		# yes/no questions get a coin-flip answer
		op = choice(('agreement', 'denial'))
		return op
	elif 'whquestion' in meaning:
		if not ':' in meaning:
			return ''
		# recast the question as an assertion, then look for the nearest
		# exemplar meaning that could serve as an answer
		m = 'assertion:' + meaning.split(':', 1)[1]
		#d = [a for a in exemplars.values() if conciliate(a, [m])]
		# edit distance on clauses, plus a penalty for variables
		# (this penalty implements a bias for definite information):
		def score(m, a):
			return edit_dist(tokens(m), tokens(a)) + 0.5*vars(a)
		d = [(score(m, a), a) for a in exemplars.values()]
		#print sorted(d)
		if min(d)[0] < 1:
			#choices = [a[1] for a in d if a[0] == min(d)[0]]
			#max variance to be eligible is twice the minimal distance
			choices = [a[1] for a in d if a[0] < 1]
			debug("possible reactions:", sorted(choices))
			if choices:
				r = choice(choices)
			else:
				return ''
			# NOTE(review): varclauses() expects a list but m is a string
			# here, so it effectively inspects only m[0] -- confirm
			m = conciliate(r, m, varclauses(m))
			if m:
				n = unifies(r, m)
				if n:
					if len(choices) > 1:
						debug('choice:', m)
					return n
				if len(choices) > 1:
					debug('choice:', m)
				return m
			else:
				return m #''
		#here we can decide to take spontaneous initiative (eg. "more juice!")
	elif 'assertion' in meaning:
		#re-phrase as two word utterance
		return meaning
		#return 'acknowledgement'

	# here we could express confusion
	return ''
def express(meaning, exemplars, lexicon):
	""" Express `meaning' by returning the most similar exemplar.

	Falls back to the exemplar whose meaning has the smallest clause-level
	edit distance; returns the string '0' when nothing is close enough. """
	if meaning in exemplars.values():
		# exact match: any utterance carrying this meaning will do
		return choice(revdict(exemplars)[meaning])
	else:
		#find approximation
		# edit distance on clauses:
		def dist(m, a):
			return edit_dist(m.split(), a.split())
		d = [(dist(meaning, b), a) for a, b in exemplars.items()]
		if min(d)[0] < len(meaning):
			# pick randomly among the closest candidates
			return choice([a for a in d if a[0] == min(d)[0]])[1]
		else:
			return '0'
def expressmulti(meaning, exemplars, constructions, lexicon):
	""" Express `meaning' by returning a matching exemplar or construction """
	# exemplar filtered according to constructions
	def check(a):
		return a in ' '.join(constructions.keys())
	# NOTE(review): this tests `meaning' against the exemplar KEYS
	# (utterances), not their meanings -- possibly intended to be
	# revdict(exemplars); confirm
	if meaning in exemplars:
		return ' '.join(a for a in exemplars[meaning] if check(a))
	# try constructions
	if meaning in constructions.values():
		return choice(revdict(constructions)[meaning])
	# this sounds a little poetic:
	if any(meaning in clauses for clauses in constructions.values()):
		return choice([utt for utt, clauses in constructions.items()
				if meaning in clauses])
	if any(clauses in meaning for clauses in constructions.values()):
		return choice([utt for utt, clauses in constructions.items()
				if clauses in meaning])
	# fall back on two word stage
	return express2(meaning, exemplars, constructions, lexicon)
def express2(meaning, exemplars, constructions, lexicon, debug=dbg):
	""" Transform a meaning into a two word utterance using exemplars,
	constructions or the lexicon. Filter result according to lexical
	knowledge.
	Input: meaning representation
	Output: one or two word utterance
	"""
	# filter out words not in lexicon
	# (NB: this local `reduce' shadows the builtin inside this function)
	def reduce(utt):
		r=' '.join(a for a in utt.split() if a.lower() in lexicon)
		if r == None:
			# NOTE(review): join never returns None; dead branch
			return ''
		return r

	# first try to find a matching exemplar
	if meaning in exemplars.values():
		utt = choice(revdict(exemplars)[meaning])
		debug('reduced:', utt)
		return reduce(utt)

	# try constructions
	if meaning in constructions.values():
		utt = choice(revdict(constructions)[meaning])
		debug('reduced:', utt)
		return reduce(utt)
	# this sounds a little poetic:
	if any(meaning in clauses for clauses in constructions.values()):
		utt = choice([utt for utt, clauses in constructions.items()
				if meaning in clauses])
		debug('reduced:', utt)
		return reduce(utt)

	# if that fails:
	# use lexicon to fetch possible words, pick two and string together
	revlex = revdict(lexicon)
	if ':' in meaning:
		# might want to do something with operators as well?
		meaning = meaning.split()
	else:
		# operator without further clauses
		if meaning in revlex:
			return revlex[meaning]
		else:
			return ''
	# only express the first clause if its argument is not a variable
	if not arg(meaning[1]) or arg(meaning[1])[1] not in uppercase:
		clause = meaning[1]
	else:
		return ''
	debug("trying to express:", clause)

	# express first clause if in lexicon:
	for a in meaning[1:]:
		if a in revlex:
			return choice(revlex[a])
	if clause in revlex:
		return revlex[clause].pop()

	# compose from lexicon using predicate and argument
	topic = arg(meaning[1])
	# NOTE(review): topic[0] is always '(' for a non-empty arg(), so this
	# test can never be true -- topic[1] may have been intended; confirm
	if topic and topic[0] in uppercase:
		if len(meaning) > 2:
			topic = pred(meaning[2])
		else:
			topic = meaning[0]
	comment = pred(meaning[1])

	candidates = [a for a,b in lexicon.items() if comment in b]
	if candidates:
		utt = choice([a for a,b in lexicon.items() if comment in b])
		debug('comment:', comment, 'in', utt)
	else:
		utt = ''

	# decide whether to express topic. TODO: think of non-random way to decide
	if choice([0, 1]):
		return utt

	candidates = [a for a,b in lexicon.items() if topic in b and utt not in a]
	if candidates:
		a = choice(candidates)
		utt = a + ' ' + utt
		debug('topic:', topic, 'in', utt)
	else:
		debug('topic not expressible?', topic)
	return utt
def inferconstructions(exemplars, lexicon, constructions={}, debug=dbg):
	""" Build corpus of constructions from exemplars and lexicon.
	Input: exemplars & lexicon.
	Output: constructions, dictionary of pairings between substrings
	and clauses.

	NOTE(review): the mutable default `constructions={}' persists across
	calls (see the remark near inferlexicon). """
	debug('distilling constructions')
	# take substr_iter of all sentences, chain these
	scores = defaultdict(int)
	n = len(constructions)
	for utt, meaning in exemplars.items():
		for a in substr_iter(utt.split()):
			# constructions consist of two or more words (lexicon otherwise):
			if len(a) == 1:
				break
			# filter out non-alphabetic characters
			words = ' '.join(b.lower() for b in a if all(c.lower() in lowercase for c in b))
			if ' ' not in words:
				continue
			# increment count in constructions if this one is unknown
			if not any(words in b for b in constructions):
				# score = number of exemplar utterances containing it
				score = len([c for c in exemplars if words in c])
				if score:
					scores[words] = score
	# shouldn't this be a priority queue instead?
	# iterate over counts, take highest counts and pick longest constructions
	for score, form in sorted(revdict(scores).items()):
		form = form[0]
		if form not in scores:
			# skip this construction if it has been removed meanwhile
			continue
		# NOTE(review): scores is mutated while iterating scores.keys();
		# safe in Python 2 (keys() is a list), would break in Python 3
		for a in scores.keys():
			# remove shorter constructions
			if a in form and form != a:
				scores.pop(a)
		# longer constructions are OK
		# however: this means that the shorter one should abstract!
		# use lexicon to check presence of words.

		# replace score with meaning, intersect like inferlexicon
		meaning = findmeaning(form, exemplars, lexicon, dbg)
		if meaning:
			debug('construction (score %d)' % score, repr(form), repr(meaning))
			constructions[form] = meaning
		else:
			ms = [meaning for utt, meaning in exemplars.items() if form in utt]
			score = '(score %d)' % score
			# NOTE(review): the comma makes this a tuple, not a formatted
			# string -- '%' was probably intended; only affects debug output
			candidate = 'not found in candidates %s', str(ms)
			debug('meaning of construction', repr(form), score, candidate)
			# if not found the next pass might find it by elimination

	#print sorted(revdict(scores).items())
	# base case (no change)
	if n == len(constructions):
		return constructions
	# recurse
	return inferconstructions(exemplars, lexicon)
613
def findmeaning(form, exemplars, lexicon, debug=dbg):
	""" Given the words in a construction, find the most common meaning
	associated with it in the corpus of exemplars.

	Returns a meaning string, or None implicitly when no exemplar
	utterance contains `form'. """
	ms = [meaning for utt, meaning in exemplars.items() if form in utt]
	#print ms
	#ehh, why this whole intersection business?
	revlex = revdict(lexicon)
	if len(ms) > 1:
		# clauses shared by every meaning associated with this form
		m = [a.split() for a in ms]
		m = reduce(intersect, m[1:], m[0])
		#print m
		if m:
			n = []  # NOTE(review): unused
			# if some word for a clause is not present in construction,
			# abstract its meaning
			def abstract(a):
				if a in revlex and any(b in form for b in revlex[a]):
					return a
				else:
					if a[0] == '(':
						return '(X)'
					#abstract operators
					return a #''
			return ' '.join(abstract(a) for a in m)
		else:
			# prefer most frequent sub-interpretation (allow non-contiguous?)
			# --> combinations instead
			return max((ms.count(a), a) for a in ms)[1]
			# NOTE(review): everything below this return is unreachable
			# legacy code, kept for reference
			scores = defaultdict(int)
			for meaning in ms:
				for a in substr_iter(meaning.split()):
					m = ' '.join(a)
					# increment count in constructions if this one is unknown
					scores[m] += 1
			return max(revdict(scores))[1][0]
		#elif len(m) > 1:
		#	# if multiple possibilities arise, prefer least frequent predicate
		#	def freq(a):
		#		return ' '.join(exemplars.values()).count(pred(a))
		#	return ' '.join(min((freq(a), a) for a in m)[1])
	if ms:
		return ms.pop()

# a mutable default argument is essentially a static variable (yay for arcana!)
def inferlexicon(exemplars, verbose=False, lexicon={}, debug=dbg):
	""" Infer lexicon from corpus of exemplars.
	Input: exemplars.
	Output: lexicon, dictionary of word-clause pairings.

	Side-effect: strips '[n]' word-index annotations from exemplar
	meanings in place. The mutable default `lexicon={}' persists across
	calls (the 'static variable' arcana noted above). """
	exlcase = dict((a.lower(), b) for a,b in exemplars.items())
	n = len(lexicon)
	# look for word indices linking meanings to words:
	# e.g. 'throw[0](X)' says utterance word 0 expresses the throw clause
	for utt,m in exemplars.items():
		if '[' in m:
			# NOTE(review): this reassigns `n', clobbering the lexicon
			# size snapshot used by the recursion test below -- confirm
			n = int(m[m.index('[')+1 : m.index(']')])
			clause = m[:m.index('[')].split()[-1]
			clause += m[m.index(']')+1:].split()[0]
			word = utt.split()[n].lower()
			# strip the annotation from the stored meaning
			exemplars[utt] = m[:m.index('[')] + m[m.index(']')+1:]
			if word not in lexicon:
				lexicon[word] = clause
	for word in set(" ".join(exemplars.keys()).split()) - set(lexicon.keys()):
		word = word.lower()
		utterances = [a.split() for a in exlcase.keys() if word in a.split()]
		meanings = [b.split() for a,b in exlcase.items() if word in a.split()]

		# reduce meanings by matching words with meanings:
		for a in list(reduce(intersect, meanings, meanings[0])): #[::-1]:
			if word in a:
				if len(a) == 1:
					lexicon[word] = a
				else:
					# select least frequent predicate (assumption is that
					# this predicate is most specific to this word)
					def freq(b):
						return ' '.join(exemplars.values()).count(pred(b))
					candidates = ((freq(b), b) for b in a.split())
					lexicon[word] = min(candidates)[1]
				break
		# if this didn't succeed and the set of exemplars which contain the
		# word reduces to a single meaning, use this:
		#DISABLED
		if False and word not in lexicon:
			a = reduce(intersect, utterances, utterances[0])
			if len(a - set(lexicon.keys())) == 1:
				lexicon[word] = reduce(intersect, meanings, meanings[0])
				if len(lexicon[word]) == 0:
					lexicon[word] = ''
				else:
					lexicon[word] = lexicon[word].pop()
		# a single word utterance carrying a single meaning:
		if len(utterances) == len(meanings) == len(utterances[0]):
			if len(utterances) == len(meanings[0]) == 1:
				lexicon[word] = meanings[0][0]
	if verbose:
		notfound = set(' '.join(exlcase.keys()).split()) - set(lexicon.keys())
		debug('lexicon: not found:', notfound, '\n')
	# recurse as long as new meanings are inferred:
	# NOTE(review): the comment and the condition disagree -- this
	# recurses when the size EQUALS the (clobbered) n; verify against
	# intended fixed-point behavior
	if len(lexicon) == n:
		return inferlexicon(exemplars, verbose)
	return lexicon
def conciliate(meaning, partialmeaning, subst=None, debug=dbg):
	""" Test whether meaning can be made compatible with partialmeaning,
	by looking for a family resemblance with partialmeanings,
	if found, perform a substitution if necessary. Returns empty string on failure.

	>>> conciliate('question: animal(bunny) do(X)', 'assertion: point(dog) animal(dog)')
	substituted (bunny) for (dog)
	'assertion: point(bunny) animal(bunny)'
	>>> conciliate('assertion: do(hop) animal(bunny)', 'assertion: animal(bunny) do(X)')
	instantiated (X) with (hop)
	'assertion: animal(bunny) do(hop)'
	>>> conciliate('assertion: do(hop) animal(bunny)', 'assertion: point(bunny)')
	''
	"""
	pm = partialmeaning
	if subst == None:
		# None as in no restrictions
		subst = pm.split()
	if pm == '':
		# nothing to reconcile against: adopt `meaning' wholesale
		return meaning
		# NOTE(review): unreachable legacy code
		partialmeaning = meaning
		return True
	success = False
	for part in meaning.split():
		if pred(part)+'(' in pm:
			# this clause's predicate also occurs in pm;
			# if there is an argument and it's not variable:
			# instantiate / substitute
			if arg(part) and not (var(arg(part)) or arg(part) in pm):
				# the argument pm currently pairs with this predicate
				oldarg = tokens(pm)[tokens(pm).index(pred(part)) + 1]
				# instantiate or substitute if allowed
				if True in [pred(part) in a for a in subst]:
					if oldarg[1] in uppercase:
						debug("instantiated", oldarg, "with", arg(part))
						partialmeaning = pm = pm.replace(oldarg, arg(part))
					else:
						debug("substituted", arg(part), "for", oldarg)
						partialmeaning = pm = pm.replace(oldarg, arg(part))
				success = True
			# disable this to match as much as possible:
			# return success
		#probe for variable predicate in first argument
		if pred(part) in uppercase:
			return partialmeaning
			# NOTE(review): unreachable legacy code
			return True
		# probe for variable predicate in second argument
		vars = [pred(a) for a in pm.split() if pred(a) in uppercase]
		# instantiate variable predicate if part is not an operator
		# and this predicate is not already present
		if vars and ':' not in part and pred(part) not in pm:
			partialmeaning = pm.replace(vars[0], pred(part))
			debug("instantiated variable predicate with", pred(part))
			# recurse to replace further vars
			return conciliate(meaning, partialmeaning, subst, debug)
		"""
		#this code would allow to substitute predicates if arguments match:
		elif arg(part) in pm and arg(part):
			ar = arg(part)
			oldpred = pm[pm.index(ar):pm.index(ar)+len(ar)]
			print ar
			print oldpred
			exit()
			print "substituted", pred(part), "for", oldpred
			partialmeaning[0] = pm.replace(oldpred, pred(part))
			return True
		"""
	if success:
		return partialmeaning
	return ''
	# NOTE(review): unreachable
	return success #False
784
def unifies(meaning, partialmeaning, debug=dbg):
	""" succeed if everything in "partialmeaning" is compatible with "meaning",
	substituting along the way. Similar to conciliate() but operates on the
	whole meaning instead of looking at individual clauses, and does not
	perform substitution, merely instantiation.

	>>> unifies('assertion: do(hop) animal(bunny)', 'assertion: do(X) animal(bunny)')
	substituted (hop) for (X)
	'assertion: do(hop) animal(bunny)'
	>>> unifies('assertion: do(hop) animal(bunny)', 'assertion: animal(bunny) do(X)')
	''

	"""
	pm = partialmeaning
	if pm == '':
		# nothing to unify against: adopt `meaning' wholesale
		return meaning
		# NOTE(review): unreachable legacy code
		partialmeaning = meaning
		return True

	# clause-by-clause, position-wise comparison (hence order matters,
	# as the second doctest shows)
	for part1, part2 in zip(pm.split(), meaning.split()):
		if part1 == part2:
			continue
		elif var(arg(part1)):
			# instantiate the variable with meaning's argument for
			# the corresponding predicate
			newarg = tokens(meaning)[tokens(meaning).index(pred(part2)) + 1]
			debug("substituted", newarg, "for", arg(part1))
			partialmeaning = pm = pm.replace(arg(part1), newarg)
		else:
			# meaning and partialmeaning are in conflict
			return '' #False
	return partialmeaning #True
815
def getexemplars():
	""" Obtain corpus, either by importing from module `exemplars,'
	or by falling back to a small sample corpus. """
	# Prefer the external corpus module when present; its getexemplars()
	# deliberately shadows this one (self-referential import).
	try:
		from exemplars import getexemplars as external
	except ImportError:
		external = None
	if external is not None:
		return external()
	#"""
	# sample corpus:
	corpus = {
		'uh': '',
		'eh': '',
		'ah': 'acknowledgement',
		'ok': 'confirmation',
		'yes': 'confirmation',
		'yeah': 'confirmation',
		'no': 'denial',
		"that's right !": 'reinforcement',
		'ball': 'assertion: point(ball) toy(ball)',
		'throw it to me': 'imperative: throw[0](X) toy(X)',
		'you kick the football !': 'imperative: kick(football) toy(football)',
		'what is this ?': 'whquestion: point(X) Y(X)',
		# should be a yes/no question, but child responded with 'cow'.
		# perhaps assertion instead?
		'is that a cow ?': 'whquestion: point(cow) animal(cow)',
		'what does a bunny do ?': 'whquestion: do(X) animal(bunny)',
		'what does a cow say ?': 'whquestion: do(X) animal(cow)',
		'what does a duckie say ?': 'whquestion: do(X) animal(duck[3])',
		'what animal does woof woof ?': 'whquestion: animal(X) do(woof)',
		"what's a kitty say ?": 'whquestion: do(X) animal(cat[2])',
		"that's a kitty": 'assertion: point(cat) animal(cat[2])',
		'donkey': 'assertion: point(donkey) animal(donkey)',
		"that's a donkey": 'assertion: point(donkey) animal(donkey)',
		'meouw': 'assertion: do(meouw) animal(cat)',
		'quack': 'assertion: do(quack) animal(duck)',
		'moo': 'assertion: do(moo) animal(cow)',
		'bunny': 'assertion: animal(bunny) do(hop)',
		'dog': 'assertion: animal(dog) do(woof)',
		'hop': 'assertion: do(hop) animal(bunny)',
		'woof woof': 'assertion: do(woof) animal(dog)',
		'want some juice ?': 'ynquestion: want(juice) food(juice)',
		'chocolate': 'assertion: food(chocolate)',
		"what's inside ?": 'whquestion: inside(X) toy(box)',
		# child guesses correctly
		'book': 'assertion: inside(book) toy(box)',
		# child takes out book & mother confirms
		'book !': 'assertion: point(book)',
		}
	return corpus
866
def test():
	""" Interpret some sample utterances.

	Demonstrates utterances that work even though they are not literally
	in the corpus; prints each utterance and its interpretation. """
	# NB: the local list shadows this function's own name inside the body
	test = [
		'what does a bunny do ?',
		'kitty do ?', #ellipsis
		'throw the ball',
		'catch the ball',
		'where lives birdie ?', #stumble on generalization
		"that's a ball", #make topic
		'can you throw it ?', #use topic
		'what does a duckie say ?',
		'what does this animal [=cow] do ?',
		'do you want to play with that [=ball] ?',
		'what does a ball say ?' # should fail
		# disabled: (chocolate / kick not in corpus)
		#'want some chocolate ?'
		#'kick the ball !',
		]
	#print evaluate('\n'.join('*MOT: %s' % a for a in test))
	exemplars = getexemplars()
	lexicon = inferlexicon(exemplars)
	for a in test:
		print a
		b = interpret(a, '', exemplars, lexicon, nodbg)
		print 'interpretation:', b
892
def edit_dist(source, target):
	""" Edit distance of two sequences. Non-standard features are that all
	mutations have a cost of 1 (so as not to favor insertions or deletions
	over substitutions), as well as a decay based on index
	(mutations with higher indices weigh less).

	>>> edit_dist('foo', 'bar')
	2.4890507991136213"""
	# positional weight: position n counts e**(-0.2*n), so a mutation at
	# the start weighs 1.0 and one around the third position about 0.5
	def weight(pos):
		return math.e ** (-0.2 * pos)

	rows, cols = len(source) + 1, len(target) + 1
	# standard dynamic-programming table; borders hold the cost of
	# deleting/inserting every element up to that index
	table = [[0] * cols for _ in range(rows)]
	for i in range(rows):
		table[i][0] = i
	for j in range(cols):
		table[0][j] = j

	# fill the table; substitution is free on a match, weighted otherwise
	for i, s in enumerate(source):
		w = weight(i)
		for j, t in enumerate(target):
			subst = table[i][j] + (0 if s == t else w)
			insert = table[i][j + 1] + w
			delete = table[i + 1][j] + w
			table[i + 1][j + 1] = min(subst, insert, delete)
	return table[-1][-1]
def substr_iter(seq):
	""" Return all substrings of a sequence, in descending order of length.

	>>> list(substr_iter('abc'))
	['abc', 'ab', 'bc', 'a', 'b', 'c']"""
	total = len(seq)
	# span is one less than the window length; longest windows first,
	# each slid from left to right
	for span in range(total, -1, -1):
		for end in range(span, total):
			yield seq[end - span:end + 1]
939
def revdict(d):
	""" Reverse dictionary, ie. swap values and keys; since a value may occur
	with multiple keys, return a list of all keys associated with a value.

	>>> revdict({0: 1, 1: 2})
	{1: [0], 2: [1]}"""
	reverse = {}
	for value in d.values():
		# collect every key that maps to this value (recomputed for
		# duplicate values, harmlessly yielding the same list)
		reverse[value] = [key for key, val in d.items() if val == value]
	return reverse
949
def varclauses(m):
	""" >>> varclauses(['foo(bar) zed(X)'])
	['zed(X)']"""
	# m is a singleton list holding one meaning string; keep only the
	# clauses whose argument is a variable
	result = []
	for clause in m[0].split():
		if var(arg(clause)):
			result.append(clause)
	return result
954
def pred(token):
	""" >>> pred('foo(bar)')
	'foo'"""
	# everything before the opening parenthesis; tokens without an
	# argument list are returned unchanged
	head, paren, _ = token.partition('(')
	return head if paren else token
961
def arg(token):
	""" >>> arg('foo(bar)')
	'(bar)'"""
	# everything from the opening parenthesis onwards; '' when the token
	# carries no argument
	pos = token.find('(')
	return token[pos:] if pos != -1 else ''
968
def var(arg):
	""" Test whether an argument string is variable:

	>>> var('(bar)')
	False
	>>> var('(X)')
	True
	"""
	# a variable is written with a capital, either bare ('X...') or
	# parenthesized ('(X)'); a falsy argument is returned as-is
	if not arg:
		return arg
	return arg[0] in uppercase or arg[1] in uppercase
978
def tokens(m):
	""" Turn meaning into a list of operator, predicates and arguments.

	>>> tokens('whquestion: do(X) animal(bunny)')
	['whquestion:', 'do', '(X)', 'animal', '(bunny)']
	"""
	# insert a space before every '(' so that a predicate and its
	# argument split into separate tokens
	spaced = m.replace('(', ' (')
	return spaced.split()
986
def intersect(a,b):
	""" Set intersection of two iterables.

	NB: returns a set, so order and duplicates are lost.

	>>> intersect([1,2,3], [2,3])
	set([2, 3])"""
	return set(a).intersection(set(b))

# this trick runs main when the script is executed, not when imported:
if __name__ == '__main__':
	import doctest
	# do doctests, but don't be pedantic about whitespace (I suspect it is the
	# militant anti-tab faction who are behind this obnoxious default)
	fail, attempted = doctest.testmod(verbose=False,
			optionflags=doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS)
	if attempted and not fail:
		print "%d doctests succeeded!" % attempted
	main()
991 992 # this trick runs main when the script is executed, not when imported: 993 if __name__ == '__main__': 994 import doctest 995 # do doctests, but don't be pedantic about whitespace (I suspect it is the 996 # militant anti-tab faction who are behind this obnoxious default) 997 fail, attempted = doctest.testmod(verbose=False, 998 optionflags=doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS) 999 if attempted and not fail: 1000 print "%d doctests succeeded!" % attempted 1001 main() 1002