1: # 375 "./lpsrc/inria_re.ipk"
2: (***********************************************************************)
3: (* *)
4: (* Objective Caml *)
5: (* *)
6: (* Xavier Leroy, projet Cristal, *)
7: (* Luc Maranget, projet Moscova, *)
8: (* INRIA Rocquencourt *)
9: (* *)
10: (* Copyright 1996 Institut National de Recherche en Informatique et *)
11: (* en Automatique. All rights reserved. This file is distributed *)
12: (* under the terms of the Q Public License version 1.0. *)
13: (* *)
14: (***********************************************************************)
15:
16: (* $Id: inria_re.ipk,v 1.2 2005/10/13 21:14:32 skaller Exp $ *)
17:
18: (* Compiling a lexer definition *)
19:
20: open Inria_syntax
21: open Printf
22:
23: exception Memory_overflow
24:
25: (* Deep abstract syntax for regular expressions *)
26:
27: type tag_info = {id : string ; start : bool ; action : int}
28:
29: type regexp =
30: Empty
31: | Chars of int * bool
32: | Action of int
33: | Tag of tag_info
34: | Seq of regexp * regexp
35: | Alt of regexp * regexp
36: | Star of regexp
37:
38: type tag_base = Start | End | Mem of int
39: type tag_addr = Sum of (tag_base * int)
40: type ident_info =
41: | Ident_string of bool * tag_addr * tag_addr
42: | Ident_char of bool * tag_addr
43: type t_env = (string * ident_info) list
44:
45: type ('args,'action) lexer_entry =
46: { lex_name: string;
47: lex_regexp: regexp;
48: lex_mem_tags: int ;
49: lex_actions: (int * t_env * 'action) list }
50:
51:
52: type automata =
53: Perform of int * tag_action list
54: | Shift of automata_trans * (automata_move * memory_action list) array
55:
56: and automata_trans =
57: No_remember
58: | Remember of int * tag_action list
59:
60: and automata_move =
61: Backtrack
62: | Goto of int
63:
64: and memory_action =
65: | Copy of int * int
66: | Set of int
67:
68: and tag_action = SetTag of int * int | EraseTag of int
69:
70: let string_of_automata_move x = match x with
71: | Backtrack -> "backtrack"
72: | Goto i -> "goto " ^ string_of_int i
73:
74: let string_of_tag_base x = match x with
75: | Start -> "Start"
76: | Mem j -> "Mem " ^ string_of_int j
77: | End -> "End"
78:
79: let string_of_tag_addr x = match x with Sum (tb,i) ->
80: string_of_tag_base tb ^ " " ^ string_of_int i
81:
82: let string_of_ident_info x = match x with
83: | Ident_string (b,t1,t2) ->
84: "string " ^
85: if b then "true " else "false " ^
86: "(" ^ string_of_tag_addr t1 ^ ", " ^ string_of_tag_addr t2 ^ ")"
87: | Ident_char (b,t1) ->
88: "char " ^
89: if b then "true " else "false " ^
90: "(" ^ string_of_tag_addr t1 ^ ")"
91:
92: let string_of_tenv iidis =
93: if List.length iidis = 0 then "none" else
94: (
95: String.concat ";\n"
96: (
97: List.map
98: (fun (s,idi) -> " var " ^ s ^ ":" ^string_of_ident_info idi)
99: iidis
100: )
101: ) ^ "\n"
102:
103: (* Representation of entry points *)
104: let string_of_tag_action x = match x with
105: | SetTag (i,j) -> "set " ^ string_of_int i ^ " " ^ string_of_int j
106: | EraseTag i -> "erase " ^ string_of_int i
107:
108: let string_of_memory_action x = match x with
109: | Copy(i,j) -> "copy " ^ string_of_int i ^ " " ^ string_of_int j
110: | Set i -> "set " ^ string_of_int i
111:
112: let string_of_tag_action_list x =
113: String.concat ", " (List.map string_of_tag_action x)
114:
115: let string_of_automata_trans x = match x with
116: | No_remember -> "No_remember"
117: | Remember (i,tal) -> "Remember " ^ string_of_int i ^ " "^
118: string_of_tag_action_list tal
119:
120: let string_of_automata a = match a with
121: | Perform (i,tal) -> "perform " ^ string_of_int i ^ " " ^
122: string_of_tag_action_list tal
123:
124: | Shift (at,vect) -> "shift " ^ string_of_automata_trans at ^
125: "\n" ^
126: (
127: let s = ref "" in
128: for i = 0 to Array.length vect - 1 do
129: let move,macts = vect.(i) in
130: if move=Backtrack && List.length macts = 0 then () else
131: s:= !s ^ "Char " ^ string_of_int i ^ " " ^
132: "move="^ string_of_automata_move move ^
133: ", macts= " ^
134: (
135: if List.length macts = 0 then "none" else
136: String.concat ", "
137: (List.map string_of_memory_action macts)
138: ) ^ "\n"
139: done;
140: !s
141: )
142:
143: type ('args,'action) automata_entry =
144: { auto_name: string;
145: auto_args: 'args ;
146: auto_mem_size : int ;
147: auto_initial_state: int * memory_action list;
148: auto_actions: (int * t_env * 'action) list }
149:
150: let string_of_automata_entry {
151: auto_name = auto_name;
152: auto_mem_size = size ;
153: auto_initial_state = (i,mal);
154: auto_actions = itenvas
155: } =
156: auto_name ^ " size=" ^ string_of_int size ^
157: ", init_state=" ^ string_of_int i ^ "\nmemacts=" ^
158: (if List.length mal = 0 then "none" else
159: String.concat "," (List.map (fun ma -> string_of_memory_action ma) mal)
160: ) ^ "\n" ^
161: "actions=\n" ^
162: String.concat "" (List.map (fun (i,te,_) ->
163: "action " ^ string_of_int i ^ "\n " ^ string_of_tenv te
164: ) itenvas)
165:
166:
167: (* A lot of sets and map structures *)
168:
169: module Ints = Set.Make(struct type t = int let compare = compare end)
170:
171: module Tags = Set.Make(struct type t = tag_info let compare = compare end)
172:
173: module TagMap =
174: Map.Make (struct type t = tag_info let compare = compare end)
175:
176: module StringSet =
177: Set.Make (struct type t = string let compare = Pervasives.compare end)
178: module StringMap =
179: Map.Make (struct type t = string let compare = Pervasives.compare end)
180:
181: (*********************)
182: (* Variable cleaning *)
183: (*********************)
184:
185: (* Silently eliminate nested variables *)
186:
187: let rec do_remove_nested to_remove = function
188: | Bind (e,x) ->
189: if StringSet.mem x to_remove then
190: do_remove_nested to_remove e
191: else
192: Bind (do_remove_nested (StringSet.add x to_remove) e, x)
193: | Epsilon|Eof|Characters _ as e -> e
194: | Sequence (e1, e2) ->
195: Sequence
196: (do_remove_nested to_remove e1, do_remove_nested to_remove e2)
197: | Alternative (e1, e2) ->
198: Alternative
199: (do_remove_nested to_remove e1, do_remove_nested to_remove e2)
200: | Repetition e ->
201: Repetition (do_remove_nested to_remove e)
202:
203: let remove_nested_as e = do_remove_nested StringSet.empty e
204:
205: (*********************)
206: (* Variable analysis *)
207: (*********************)
208:
209: (*
210: Optional variables.
211: A variable is optional when matching of regexp does not
212: implies it binds.
213: The typical case is:
214: ("" | 'a' as x) -> optional
215: ("" as x | 'a' as x) -> non-optional
216: *)
217:
218: let stringset_delta s1 s2 =
219: StringSet.union
220: (StringSet.diff s1 s2)
221: (StringSet.diff s2 s1)
222:
223: let rec find_all_vars = function
224: | Characters _|Epsilon|Eof ->
225: StringSet.empty
226: | Bind (e,x) ->
227: StringSet.add x (find_all_vars e)
228: | Sequence (e1,e2)|Alternative (e1,e2) ->
229: StringSet.union (find_all_vars e1) (find_all_vars e2)
230: | Repetition e -> find_all_vars e
231:
232:
233: let rec do_find_opt = function
234: | Characters _|Epsilon|Eof -> StringSet.empty, StringSet.empty
235: | Bind (e,x) ->
236: let opt,all = do_find_opt e in
237: opt, StringSet.add x all
238: | Sequence (e1,e2) ->
239: let opt1,all1 = do_find_opt e1
240: and opt2,all2 = do_find_opt e2 in
241: StringSet.union opt1 opt2, StringSet.union all1 all2
242: | Alternative (e1,e2) ->
243: let opt1,all1 = do_find_opt e1
244: and opt2,all2 = do_find_opt e2 in
245: StringSet.union
246: (StringSet.union opt1 opt2)
247: (stringset_delta all1 all2),
248: StringSet.union all1 all2
249: | Repetition e ->
250: let r = find_all_vars e in
251: r,r
252:
253: let find_optional e =
254: let r,_ = do_find_opt e in r
255:
256: (*
257: Double variables
258: A variable is double when it can be bound more than once
259: in a single matching
260: The typical case is:
261: (e1 as x) (e2 as x)
262:
263: *)
264:
265: let rec do_find_double = function
266: | Characters _|Epsilon|Eof -> StringSet.empty, StringSet.empty
267: | Bind (e,x) ->
268: let dbl,all = do_find_double e in
269: (if StringSet.mem x all then
270: StringSet.add x dbl
271: else
272: dbl),
273: StringSet.add x all
274: | Sequence (e1,e2) ->
275: let dbl1, all1 = do_find_double e1
276: and dbl2, all2 = do_find_double e2 in
277: StringSet.union
278: (StringSet.inter all1 all2)
279: (StringSet.union dbl1 dbl2),
280: StringSet.union all1 all2
281: | Alternative (e1,e2) ->
282: let dbl1, all1 = do_find_double e1
283: and dbl2, all2 = do_find_double e2 in
284: StringSet.union dbl1 dbl2,
285: StringSet.union all1 all2
286: | Repetition e ->
287: let r = find_all_vars e in
288: r,r
289:
290: let find_double e = do_find_double e
291:
292: (*
293: Type of variables:
294: A variable is bound to a char when all its occurences
295: bind a pattern of length 1.
296: The typical case is:
297: (_ as x) -> char
298: *)
299:
300: let add_some x = function
301: | Some i -> Some (x+i)
302: | None -> None
303:
304: let add_some_some x y = match x,y with
305: | Some i, Some j -> Some (i+j)
306: | _,_ -> None
307:
308: let rec do_find_chars sz = function
309: | Epsilon|Eof -> StringSet.empty, StringSet.empty, sz
310: | Characters _ -> StringSet.empty, StringSet.empty, add_some 1 sz
311: | Bind (e,x) ->
312: let c,s,e_sz = do_find_chars (Some 0) e in
313: begin match e_sz with
314: | Some 1 ->
315: StringSet.add x c,s,add_some 1 sz
316: | _ ->
317: c, StringSet.add x s, add_some_some sz e_sz
318: end
319: | Sequence (e1,e2) ->
320: let c1,s1,sz1 = do_find_chars sz e1 in
321: let c2,s2,sz2 = do_find_chars sz1 e2 in
322: StringSet.union c1 c2,
323: StringSet.union s1 s2,
324: sz2
325: | Alternative (e1,e2) ->
326: let c1,s1,sz1 = do_find_chars sz e1
327: and c2,s2,sz2 = do_find_chars sz e2 in
328: StringSet.union c1 c2,
329: StringSet.union s1 s2,
330: (if sz1 = sz2 then sz1 else None)
331: | Repetition e -> do_find_chars None e
332:
333:
334:
335: let find_chars e =
336: let c,s,_ = do_find_chars (Some 0) e in
337: StringSet.diff c s
338:
339: (*******************************)
340: (* From shallow to deep syntax *)
341: (*******************************)
342:
343: let chars = ref ([] : Inria_cset.t list)
344: let chars_count = ref 0
345:
346:
347: let rec encode_regexp char_vars act = function
348: Epsilon -> Empty
349: | Characters cl ->
350: let n = !chars_count in
351: chars := cl :: !chars;
352: incr chars_count;
353: print_endline ("Position " ^ string_of_int n ^ "->" ^ Inria_cset.string_of_characters cl);
354: Chars(n,false)
355: | Eof ->
356: let n = !chars_count in
357: chars := Inria_cset.eof :: !chars;
358: incr chars_count;
359: Chars(n,true)
360: | Sequence(r1,r2) ->
361: let r1 = encode_regexp char_vars act r1 in
362: let r2 = encode_regexp char_vars act r2 in
363: Seq (r1, r2)
364: | Alternative(r1,r2) ->
365: let r1 = encode_regexp char_vars act r1 in
366: let r2 = encode_regexp char_vars act r2 in
367: Alt(r1, r2)
368: | Repetition r ->
369: let r = encode_regexp char_vars act r in
370: Star r
371: | Bind (r,x) ->
372: let r = encode_regexp char_vars act r in
373: if StringSet.mem x char_vars then
374: Seq (Tag {id=x ; start=true ; action=act},r)
375: else
376: Seq (Tag {id=x ; start=true ; action=act},
377: Seq (r, Tag {id=x ; start=false ; action=act}))
378:
379:
380: (* Optimisation,
381: Static optimization :
382: Replace tags by offsets relative to the beginning
383: or end of matched string.
384: Dynamic optimization:
385: Replace some non-optional, non-double tags by offsets w.r.t
386: a previous similar tag.
387: *)
388:
389: let incr_pos = function
390: | None -> None
391: | Some i -> Some (i+1)
392:
393: let decr_pos = function
394: | None -> None
395: | Some i -> Some (i-1)
396:
397:
398: let opt = true
399:
400: let mk_seq r1 r2 = match r1,r2 with
401: | Empty,_ -> r2
402: | _,Empty -> r1
403: | _,_ -> Seq (r1,r2)
404:
405: let add_pos p i = match p with
406: | Some (Sum (a,n)) -> Some (Sum (a,n+i))
407: | None -> None
408:
409: let opt_regexp all_vars char_vars optional_vars double_vars r =
410:
411: (* From removed tags to their addresses *)
412: let env = Hashtbl.create 17 in
413:
414: (* First static optimizations, from start position *)
415: let rec size_forward pos = function
416: | Empty|Chars (_,true)|Tag _ -> Some pos
417: | Chars (_,false) -> Some (pos+1)
418: | Seq (r1,r2) ->
419: begin match size_forward pos r1 with
420: | None -> None
421: | Some pos -> size_forward pos r2
422: end
423: | Alt (r1,r2) ->
424: let pos1 = size_forward pos r1
425: and pos2 = size_forward pos r2 in
426: if pos1=pos2 then pos1 else None
427: | Star _ -> None
428: | Action _ -> assert false in
429:
430: let rec simple_forward pos r = match r with
431: | Tag n ->
432: if StringSet.mem n.id double_vars then
433: r,Some pos
434: else begin
435: Hashtbl.add env (n.id,n.start) (Sum (Start, pos)) ;
436: Empty,Some pos
437: end
438: | Empty -> r, Some pos
439: | Chars (_,is_eof) ->
440: r,Some (if is_eof then pos else pos+1)
441: | Seq (r1,r2) ->
442: let r1,pos = simple_forward pos r1 in
443: begin match pos with
444: | None -> mk_seq r1 r2,None
445: | Some pos ->
446: let r2,pos = simple_forward pos r2 in
447: mk_seq r1 r2,pos
448: end
449: | Alt (r1,r2) ->
450: let pos1 = size_forward pos r1
451: and pos2 = size_forward pos r2 in
452: r,(if pos1=pos2 then pos1 else None)
453: | Star _ -> r,None
454: | Action _ -> assert false in
455:
456: (* Then static optimizations, from end position *)
457: let rec size_backward pos = function
458: | Empty|Chars (_,true)|Tag _ -> Some pos
459: | Chars (_,false) -> Some (pos-1)
460: | Seq (r1,r2) ->
461: begin match size_backward pos r2 with
462: | None -> None
463: | Some pos -> size_backward pos r1
464: end
465: | Alt (r1,r2) ->
466: let pos1 = size_backward pos r1
467: and pos2 = size_backward pos r2 in
468: if pos1=pos2 then pos1 else None
469: | Star _ -> None
470: | Action _ -> assert false in
471:
472:
473: let rec simple_backward pos r = match r with
474: | Tag n ->
475: if StringSet.mem n.id double_vars then
476: r,Some pos
477: else begin
478: Hashtbl.add env (n.id,n.start) (Sum (End, pos)) ;
479: Empty,Some pos
480: end
481: | Empty -> r,Some pos
482: | Chars (_,is_eof) ->
483: r,Some (if is_eof then pos else pos-1)
484: | Seq (r1,r2) ->
485: let r2,pos = simple_backward pos r2 in
486: begin match pos with
487: | None -> mk_seq r1 r2,None
488: | Some pos ->
489: let r1,pos = simple_backward pos r1 in
490: mk_seq r1 r2,pos
491: end
492: | Alt (r1,r2) ->
493: let pos1 = size_backward pos r1
494: and pos2 = size_backward pos r2 in
495: r,(if pos1=pos2 then pos1 else None)
496: | Star _ -> r,None
497: | Action _ -> assert false in
498:
499: let r =
500: if opt then
501: let r,_ = simple_forward 0 r in
502: let r,_ = simple_backward 0 r in
503: r
504: else
505: r in
506:
507: let loc_count = ref 0 in
508: let get_tag_addr t =
509: try
510: Hashtbl.find env t
511: with
512: | Not_found ->
513: let n = !loc_count in
514: incr loc_count ;
515: Hashtbl.add env t (Sum (Mem n,0)) ;
516: Sum (Mem n,0) in
517:
518: let rec alloc_exp pos r = match r with
519: | Tag n ->
520: if StringSet.mem n.id double_vars then
521: r,pos
522: else begin match pos with
523: | Some a ->
524: Hashtbl.add env (n.id,n.start) a ;
525: Empty,pos
526: | None ->
527: let a = get_tag_addr (n.id,n.start) in
528: r,Some a
529: end
530:
531: | Empty -> r,pos
532: | Chars (_,is_eof) -> r,(if is_eof then pos else add_pos pos 1)
533: | Seq (r1,r2) ->
534: let r1,pos = alloc_exp pos r1 in
535: let r2,pos = alloc_exp pos r2 in
536: mk_seq r1 r2,pos
537: | Alt (_,_) ->
538: let off = size_forward 0 r in
539: begin match off with
540: | Some i -> r,add_pos pos i
541: | None -> r,None
542: end
543: | Star _ -> r,None
544: | Action _ -> assert false in
545:
546: let r,_ = alloc_exp None r in
547: let m =
548: StringSet.fold
549: (fun x r ->
550: let v =
551: if StringSet.mem x char_vars then
552: Ident_char
553: (StringSet.mem x optional_vars, get_tag_addr (x,true))
554: else
555: Ident_string
556: (StringSet.mem x optional_vars,
557: get_tag_addr (x,true),
558: get_tag_addr (x,false)) in
559: (x,v)::r)
560: all_vars [] in
561: m,r, !loc_count
562:
563:
564:
565: let encode_casedef casedef =
566: let r =
567: List.fold_left
568: (fun (reg,actions,count,ntags) (expr, act) ->
569: let expr = remove_nested_as expr in
570: let char_vars = find_chars expr in
571: let r = encode_regexp char_vars count expr
572: and opt_vars = find_optional expr
573: and double_vars,all_vars = find_double expr in
574: let m,r,loc_ntags =
575: opt_regexp all_vars char_vars opt_vars double_vars r in
576: Alt(reg, Seq(r, Action count)),
577: (count, m ,act) :: actions,
578: (succ count),
579: max loc_ntags ntags)
580: (Empty, [], 0, 0)
581: casedef in
582: r
583:
584: let encode_lexdef def =
585: chars := [];
586: chars_count := 0;
587: let entry_list =
588: List.map
589: (fun {name=entry_name ; args=args ; shortest=shortest ; clauses= casedef} ->
590: let (re,actions,_,ntags) = encode_casedef casedef in
591: { lex_name = entry_name;
592: lex_regexp = re;
593: lex_mem_tags = ntags ;
594: lex_actions = List.rev actions },args,shortest)
595: def in
596: let chr = Array.of_list (List.rev !chars) in
597: chars := [];
598: (chr, entry_list)
599:
600: (* To generate directly a NFA from a regular expression.
601: Confer Aho-Sethi-Ullman, dragon book, chap. 3
602: Extension to tagged automata.
603: Confer
604: Ville Larikari
605: ``NFAs with Tagged Transitions, their Conversion to Deterministic
606: Automata and Application to Regular Expressions''.
607: Symposium on String Processing and Information Retrieval (SPIRE 2000),
608: http://kouli.iki.fi/~vlaurika/spire2000-tnfa.ps
609: (See also)
610: http://kouli.iki.fi/~vlaurika/regex-submatch.ps.gz
611: *)
612:
613: type t_transition =
614: OnChars of int
615: | ToAction of int
616:
617: type transition = t_transition * Tags.t
618:
619: let compare_trans (t1,tags1) (t2,tags2) =
620: match Pervasives.compare t1 t2 with
621: | 0 -> Tags.compare tags1 tags2
622: | r -> r
623:
624:
625: module TransSet =
626: Set.Make(struct type t = transition let compare = compare end)
627:
628: let rec nullable = function
629: | Empty|Tag _ -> true
630: | Chars (_,_)|Action _ -> false
631: | Seq(r1,r2) -> nullable r1 && nullable r2
632: | Alt(r1,r2) -> nullable r1 || nullable r2
633: | Star r -> true
634:
635: let rec emptymatch = function
636: | Empty | Chars (_,_) | Action _ -> Tags.empty
637: | Tag t -> Tags.add t Tags.empty
638: | Seq (r1,r2) -> Tags.union (emptymatch r1) (emptymatch r2)
639: | Alt(r1,r2) ->
640: if nullable r1 then
641: emptymatch r1
642: else
643: emptymatch r2
644: | Star r ->
645: if nullable r then
646: emptymatch r
647: else
648: Tags.empty
649:
650: let addtags transs tags =
651: TransSet.fold
652: (fun (t,tags_t) r -> TransSet.add (t, Tags.union tags tags_t) r)
653: transs TransSet.empty
654:
655:
656: let rec firstpos = function
657: Empty|Tag _ -> TransSet.empty
658: | Chars (pos,_) -> TransSet.add (OnChars pos,Tags.empty) TransSet.empty
659: | Action act -> TransSet.add (ToAction act,Tags.empty) TransSet.empty
660: | Seq(r1,r2) ->
661: if nullable r1 then
662: TransSet.union (firstpos r1) (addtags (firstpos r2) (emptymatch r1))
663: else
664: firstpos r1
665: | Alt(r1,r2) -> TransSet.union (firstpos r1) (firstpos r2)
666: | Star r -> firstpos r
667:
668:
669: (* Berry-sethi followpos *)
670: let followpos size entry_list =
671: let v = Array.create size TransSet.empty in
672: let rec fill s = function
673: | Empty|Action _|Tag _ -> ()
674: | Chars (n,_) -> v.(n) <- s
675: | Alt (r1,r2) ->
676: fill s r1 ; fill s r2
677: | Seq (r1,r2) ->
678: fill
679: (if nullable r2 then
680: TransSet.union (firstpos r2) (addtags s (emptymatch r2))
681: else
682: (firstpos r2))
683: r1 ;
684: fill s r2
685: | Star r ->
686: fill (TransSet.union (firstpos r) s) r in
687: List.iter (fun (entry,_,_) -> fill TransSet.empty entry.lex_regexp) entry_list ;
688: v
689:
690: (************************)
691: (* The algorithm itself *)
692: (************************)
693:
694: let no_action = max_int
695:
696: module StateSet =
697: Set.Make (struct type t = t_transition let compare = Pervasives.compare end)
698:
699:
700: module MemMap =
701: Map.Make (struct type t = int let compare = Pervasives.compare end)
702:
703: type 'a dfa_state =
704: {final : int * ('a * int TagMap.t) ;
705: others : ('a * int TagMap.t) MemMap.t}
706:
707: (* *)
708: let dtag oc t =
709: fprintf oc "%s<%s>" t.id (if t.start then "s" else "e")
710:
711: let dmem_map dp ds m =
712: MemMap.iter
713: (fun k x ->
714: eprintf "%d -> " k ; dp x ; ds ())
715: m
716:
717: and dtag_map dp ds m =
718: TagMap.iter
719: (fun t x ->
720: dtag stderr t ; eprintf " -> " ; dp x ; ds ())
721: m
722:
723: let dstate {final=(act,(_,m)) ; others=o} =
724: if act <> no_action then begin
725: eprintf "final=%d " act ;
726: dtag_map (fun x -> eprintf "%d" x) (fun () -> prerr_string " ,") m ;
727: prerr_endline ""
728: end ;
729: dmem_map
730: (fun (_,m) ->
731: dtag_map (fun x -> eprintf "%d" x) (fun () -> prerr_string " ,") m)
732: (fun () -> prerr_endline "")
733: o
734: (* *)
735:
736: let dfa_state_empty =
737: {final=(no_action, (max_int,TagMap.empty)) ;
738: others=MemMap.empty}
739:
740: and dfa_state_is_empty {final=(act,_) ; others=o} =
741: act = no_action &&
742: o = MemMap.empty
743:
744:
745: (* A key is an abstraction on a dfa state,
746: two states with the same key can be made the same by
747: copying some memory cells into others *)
748:
749:
750: module StateSetSet =
751: Set.Make (struct type t = StateSet.t let compare = StateSet.compare end)
752:
753: type t_equiv = {tag:tag_info ; equiv:StateSetSet.t}
754:
755: module MemKey =
756: Set.Make
757: (struct
758: type t = t_equiv
759:
760: let compare e1 e2 = match Pervasives.compare e1.tag e2.tag with
761: | 0 -> StateSetSet.compare e1.equiv e2.equiv
762: | r -> r
763: end)
764:
765: type dfa_key = {kstate : StateSet.t ; kmem : MemKey.t}
766:
767: (* Map a state to its key *)
768: let env_to_class m =
769: let env1 =
770: MemMap.fold
771: (fun _ (tag,s) r ->
772: try
773: let ss = TagMap.find tag r in
774: let r = TagMap.remove tag r in
775: TagMap.add tag (StateSetSet.add s ss) r
776: with
777: | Not_found ->
778: TagMap.add tag (StateSetSet.add s StateSetSet.empty) r)
779: m TagMap.empty in
780: TagMap.fold
781: (fun tag ss r -> MemKey.add {tag=tag ; equiv=ss} r)
782: env1 MemKey.empty
783:
784:
785: (* trans is nfa_state, m is associated memory map *)
786: let inverse_mem_map trans m r =
787: TagMap.fold
788: (fun tag addr r ->
789: try
790: let otag,s = MemMap.find addr r in
791: assert (tag = otag) ;
792: let r = MemMap.remove addr r in
793: MemMap.add addr (tag,StateSet.add trans s) r
794: with
795: | Not_found ->
796: MemMap.add addr (tag,StateSet.add trans StateSet.empty) r)
797: m r
798:
799: let inverse_mem_map_other n (_,m) r = inverse_mem_map (OnChars n) m r
800:
801: let get_key {final=(act,(_,m_act)) ; others=o} =
802: let env =
803: MemMap.fold inverse_mem_map_other
804: o
805: (if act = no_action then MemMap.empty
806: else inverse_mem_map (ToAction act) m_act MemMap.empty) in
807: let state_key =
808: MemMap.fold (fun n _ r -> StateSet.add (OnChars n) r) o
809: (if act=no_action then StateSet.empty
810: else StateSet.add (ToAction act) StateSet.empty) in
811: let mem_key = env_to_class env in
812: {kstate = state_key ; kmem = mem_key}
813:
814:
815: let key_compare k1 k2 = match StateSet.compare k1.kstate k2.kstate with
816: | 0 -> MemKey.compare k1.kmem k2.kmem
817: | r -> r
818:
819: (* Association dfa_state -> state_num *)
820:
821: module StateMap =
822: Map.Make(struct type t = dfa_key let compare = key_compare end)
823:
824: let state_map = ref (StateMap.empty : int StateMap.t)
825: let todo = Stack.create()
826: let next_state_num = ref 0
827: let next_mem_cell = ref 0
828: let temp_pending = ref false
829: let tag_cells = Hashtbl.create 17
830: let state_table = Inria_table.create dfa_state_empty
831:
832:
833: let reset_state_mem () =
834: state_map := StateMap.empty;
835: Stack.clear todo;
836: next_state_num := 0 ;
837: let _ = Inria_table.trim state_table in
838: ()
839:
840: (* Allocation of memory cells *)
841: let reset_cell_mem ntags =
842: next_mem_cell := ntags ;
843: Hashtbl.clear tag_cells ;
844: temp_pending := false
845:
846: let do_alloc_temp () =
847: temp_pending := true ;
848: let n = !next_mem_cell in
849: n
850:
851: let do_alloc_cell used t =
852: let available =
853: try Hashtbl.find tag_cells t with Not_found -> Ints.empty in
854: try
855: Ints.choose (Ints.diff available used)
856: with
857: | Not_found ->
858: temp_pending := false ;
859: let n = !next_mem_cell in
860: if n >= 255 then raise Memory_overflow ;
861: Hashtbl.replace tag_cells t (Ints.add n available) ;
862: incr next_mem_cell ;
863: n
864:
865: let is_old_addr a = a >= 0
866: and is_new_addr a = a < 0
867:
868: let old_in_map m r =
869: TagMap.fold
870: (fun _ addr r ->
871: if is_old_addr addr then
872: Ints.add addr r
873: else
874: r)
875: m r
876:
877: let alloc_map used m mvs =
878: TagMap.fold
879: (fun tag a (r,mvs) ->
880: let a,mvs =
881: if is_new_addr a then
882: let a = do_alloc_cell used tag in
883: a,Ints.add a mvs
884: else a,mvs in
885: TagMap.add tag a r,mvs)
886: m (TagMap.empty,mvs)
887:
888: let create_new_state {final=(act,(_,m_act)) ; others=o} =
889: let used =
890: MemMap.fold (fun _ (_,m) r -> old_in_map m r)
891: o (old_in_map m_act Ints.empty) in
892:
893: let new_m_act,mvs = alloc_map used m_act Ints.empty in
894: let new_o,mvs =
895: MemMap.fold (fun k (x,m) (r,mvs) ->
896: let m,mvs = alloc_map used m mvs in
897: MemMap.add k (x,m) r,mvs)
898: o (MemMap.empty,mvs) in
899: {final=(act,(0,new_m_act)) ; others=new_o},
900: Ints.fold (fun x r -> Set x::r) mvs []
901:
902: type new_addr_gen = {mutable count : int ; mutable env : int TagMap.t}
903:
904: let create_new_addr_gen () = {count = -1 ; env = TagMap.empty}
905:
906: let alloc_new_addr tag r =
907: try
908: TagMap.find tag r.env
909: with
910: | Not_found ->
911: let a = r.count in
912: r.count <- a-1 ;
913: r.env <- TagMap.add tag a r.env ;
914: a
915:
916:
917: let create_mem_map tags gen =
918: Tags.fold
919: (fun tag r -> TagMap.add tag (alloc_new_addr tag gen) r)
920: tags TagMap.empty
921:
922: let create_init_state pos =
923: let gen = create_new_addr_gen () in
924: let st =
925: TransSet.fold
926: (fun (t,tags) st ->
927: match t with
928: | ToAction n ->
929: let on,otags = st.final in
930: if n < on then
931: {st with final = (n, (0,create_mem_map tags gen))}
932: else
933: st
934: | OnChars n ->
935: try
936: let _ = MemMap.find n st.others in assert false
937: with
938: | Not_found ->
939: {st with others =
940: MemMap.add n (0,create_mem_map tags gen) st.others})
941: pos dfa_state_empty in
942: st
943:
944:
945: let get_map t st = match t with
946: | ToAction _ -> let _,(_,m) = st.final in m
947: | OnChars n ->
948: let (_,m) = MemMap.find n st.others in
949: m
950:
951: let dest = function | Copy (d,_) | Set d -> d
952: and orig = function | Copy (_,o) -> o | Set _ -> -1
953:
954: let pmv oc mv = fprintf oc "%d <- %d" (dest mv) (orig mv)
955: let pmvs oc mvs =
956: List.iter (fun mv -> fprintf oc "%a " pmv mv) mvs ;
957: output_char oc '\n' ; flush oc
958:
959:
960: (* Topological sort << a la louche >> *)
961: let sort_mvs mvs =
962: let rec do_rec r mvs = match mvs with
963: | [] -> r
964: | _ ->
965: let dests =
966: List.fold_left
967: (fun r mv -> Ints.add (dest mv) r)
968: Ints.empty mvs in
969: let rem,here =
970: List.partition
971: (fun mv -> Ints.mem (orig mv) dests)
972: mvs in
973: match here with
974: | [] ->
975: begin match rem with
976: | Copy (d,_)::_ ->
977: let d' = do_alloc_temp () in
978: Copy (d',d)::
979: do_rec r
980: (List.map
981: (fun mv ->
982: if orig mv = d then
983: Copy (dest mv,d')
984: else
985: mv)
986: rem)
987: | _ -> assert false
988: end
989: | _ -> do_rec (here@r) rem in
990: do_rec [] mvs
991:
992: let move_to mem_key src tgt =
993: let mvs =
994: MemKey.fold
995: (fun {tag=tag ; equiv=m} r ->
996: StateSetSet.fold
997: (fun s r ->
998: try
999: let t = StateSet.choose s in
1000: let src = TagMap.find tag (get_map t src)
1001: and tgt = TagMap.find tag (get_map t tgt) in
1002: if src <> tgt then begin
1003: if is_new_addr src then
1004: Set tgt::r
1005: else
1006: Copy (tgt, src)::r
1007: end else
1008: r
1009: with
1010: | Not_found -> assert false)
1011: m r)
1012: mem_key [] in
1013: (* Moves are topologically sorted *)
1014: sort_mvs mvs
1015:
1016:
1017: let get_state st =
1018: let key = get_key st in
1019: try
1020: let num = StateMap.find key !state_map in
1021: num,move_to key.kmem st (Inria_table.get state_table num)
1022: with Not_found ->
1023: let num = !next_state_num in
1024: incr next_state_num;
1025: let st,mvs = create_new_state st in
1026: Inria_table.emit state_table st ;
1027: state_map := StateMap.add key num !state_map;
1028: Stack.push (st, num) todo;
1029: num,mvs
1030:
1031: let map_on_all_states f old_res =
1032: let res = ref old_res in
1033: begin try
1034: while true do
1035: let (st, i) = Stack.pop todo in
1036: let r = f st in
1037: res := (r, i) :: !res
1038: done
1039: with Stack.Empty -> ()
1040: end;
1041: !res
1042:
1043: let goto_state st =
1044: if
1045: dfa_state_is_empty st
1046: then
1047: Backtrack,[]
1048: else
1049: let n,moves = get_state st in
1050: Goto n,moves
1051:
1052: (****************************)
1053: (* compute reachable states *)
1054: (****************************)
1055:
1056: let add_tags_to_map gen tags m =
1057: Tags.fold
1058: (fun tag m ->
1059: let m = TagMap.remove tag m in
1060: TagMap.add tag (alloc_new_addr tag gen) m)
1061: tags m
1062:
1063: let apply_transition gen r pri m = function
1064: | ToAction n,tags ->
1065: let on,(opri,_) = r.final in
1066: if n < on || (on=n && pri < opri) then
1067: let m = add_tags_to_map gen tags m in
1068: {r with final=n,(pri,m)}
1069: else r
1070: | OnChars n,tags ->
1071: try
1072: let (opri,_) = MemMap.find n r.others in
1073: if pri < opri then
1074: let m = add_tags_to_map gen tags m in
1075: {r with others=MemMap.add n (pri,m) (MemMap.remove n r.others)}
1076: else
1077: r
1078: with
1079: | Not_found ->
1080: let m = add_tags_to_map gen tags m in
1081: {r with others=MemMap.add n (pri,m) r.others}
1082:
1083: (* add transitions ts to new state r
1084: transitions in ts start from state pri and memory map m
1085: *)
1086: let apply_transitions gen r pri m ts =
1087: TransSet.fold
1088: (fun t r -> apply_transition gen r pri m t)
1089: ts r
1090:
1091:
1092: (* For a given nfa_state pos, refine char partition *)
1093: let rec split_env gen follow pos m s = function
1094: | [] -> (* Can occur ! because of non-matching regexp ([^'\000'-'\255']) *)
1095: []
1096: | (s1,st1) as p::rem ->
1097: let here = Inria_cset.inter s s1 in
1098: if Inria_cset.is_empty here then
1099: p::split_env gen follow pos m s rem
1100: else
1101: let rest = Inria_cset.diff s here in
1102: let rem =
1103: if Inria_cset.is_empty rest then
1104: rem
1105: else
1106: split_env gen follow pos m rest rem
1107: and new_st = apply_transitions gen st1 pos m follow in
1108: let stay = Inria_cset.diff s1 here in
1109: if Inria_cset.is_empty stay then
1110: (here, new_st)::rem
1111: else
1112: (stay, st1)::(here, new_st)::rem
1113:
1114:
1115: (* For all nfa_state pos in a dfa state st *)
1116: let comp_shift gen chars follow st =
1117: MemMap.fold
1118: (fun pos (_,m) env -> split_env gen follow.(pos) pos m chars.(pos) env)
1119: st [Inria_cset.all_chars_eof,dfa_state_empty]
1120:
1121:
1122: let reachs chars follow st =
1123: let gen = create_new_addr_gen () in
1124: (* build a association list (char set -> new state) *)
1125: let env = comp_shift gen chars follow st in
1126: (* change it into (char set -> new state_num) *)
1127: let env =
1128: List.map
1129: (fun (s,dfa_state) -> s,goto_state dfa_state) env in
1130: (* finally build the char indexed array -> new state num *)
1131: let shift = Inria_cset.env_to_array env in
1132: shift
1133:
1134:
1135: let get_tag_mem n env t =
1136: try
1137: TagMap.find t env.(n)
1138: with
1139: | Not_found -> assert false
1140:
1141: let do_tag_actions n env m =
1142:
1143: let used,r =
1144: TagMap.fold (fun t m (used,r) ->
1145: let a = get_tag_mem n env t in
1146: Ints.add a used,SetTag (a,m)::r) m (Ints.empty,[]) in
1147: let _,r =
1148: TagMap.fold
1149: (fun tag m (used,r) ->
1150: if not (Ints.mem m used) && tag.start then
1151: Ints.add m used, EraseTag m::r
1152: else
1153: used,r)
1154: env.(n) (used,r) in
1155: r
1156:
1157:
1158: let translate_state shortest_match tags chars follow st =
1159: let (n,(_,m)) = st.final in
1160: if MemMap.empty = st.others then
1161: Perform (n,do_tag_actions n tags m)
1162: else if shortest_match then begin
1163: if n=no_action then
1164: Shift (No_remember,reachs chars follow st.others)
1165: else
1166: Perform(n, do_tag_actions n tags m)
1167: end else begin
1168: Shift (
1169: (if n = no_action then
1170: No_remember
1171: else
1172: Remember (n,do_tag_actions n tags m)),
1173: reachs chars follow st.others)
1174: end
1175:
1176: (* *)
1177: let dtags chan tags =
1178: Tags.iter
1179: (fun t -> fprintf chan " %a" dtag t)
1180: tags
1181:
1182: let dtransset s =
1183: TransSet.iter
1184: (fun trans -> match trans with
1185: | OnChars i,tags ->
1186: eprintf " (-> %d,%a)" i dtags tags
1187: | ToAction i,tags ->
1188: eprintf " ([%d],%a)" i dtags tags)
1189: s
1190:
1191: let dfollow t =
1192: eprintf "follow=[" ;
1193: for i = 0 to Array.length t-1 do
1194: eprintf "\n%d:" i ;
1195: dtransset t.(i)
1196: done ;
1197: prerr_endline "]"
1198: (* *)
1199:
1200: let make_tag_entry id start act a r = match a with
1201: | Sum (Mem m,0) ->
1202: TagMap.add {id=id ; start=start ; action=act} m r
1203: | _ -> r
1204:
1205: let extract_tags l =
1206: let envs = Array.create (List.length l) TagMap.empty in
1207: List.iter
1208: (fun (act,m,_) ->
1209: envs.(act) <-
1210: List.fold_right
1211: (fun (x,v) r -> match v with
1212: | Ident_char (_,t) -> make_tag_entry x true act t r
1213: | Ident_string (_,t1,t2) ->
1214: make_tag_entry x true act t1
1215: (make_tag_entry x false act t2 r))
1216: m TagMap.empty)
1217: l ;
1218: envs
1219:
1220:
1221: let make_dfa lexdef =
1222: let (chars, entry_list) = encode_lexdef lexdef in
1223: let follow = followpos (Array.length chars) entry_list in
1224: (* *)
1225: dfollow follow ;
1226: (* *)
1227: reset_state_mem () ;
1228: let r_states = ref [] in
1229: let initial_states =
1230: List.map
1231: (fun (le,args,shortest) ->
1232: let tags = extract_tags le.lex_actions in
1233: reset_cell_mem le.lex_mem_tags ;
1234: let pos_set = firstpos le.lex_regexp in
1235: (* *)
1236: prerr_string "trans={" ; dtransset pos_set ; prerr_endline "}" ;
1237: (* *)
1238: let init_state = create_init_state pos_set in
1239: let init_num = get_state init_state in
1240: r_states :=
1241: map_on_all_states
1242: (translate_state shortest tags chars follow) !r_states ;
1243: { auto_name = le.lex_name;
1244: auto_args = args ;
1245: auto_mem_size =
1246: (if !temp_pending then !next_mem_cell+1 else !next_mem_cell) ;
1247: auto_initial_state = init_num ;
1248: auto_actions = le.lex_actions })
1249: entry_list in
1250: let states = !r_states in
1251: (* *)
1252: prerr_endline "** states **" ;
1253: for i = 0 to !next_state_num-1 do
1254: eprintf "+++ %d +++\n" i ;
1255: dstate (Inria_table.get state_table i) ;
1256: prerr_endline ""
1257: done ;
1258: eprintf "%d states\n" !next_state_num ;
1259: (* *)
1260: let actions = Array.create !next_state_num (Perform (0,[])) in
1261: List.iter (fun (act, i) -> actions.(i) <- act ) states;
1262:
1263: print_endline "Actions ..";
1264: for i = 0 to !next_state_num - 1 do
1265: let act = actions.(i) in
1266: print_endline
1267: ("State " ^ string_of_int i ^ " has action " ^ string_of_automata act)
1268: done;
1269: reset_state_mem () ;
1270: reset_cell_mem 0 ;
1271:
1272: print_endline "Initial states ..";
1273: List.iter
1274: (fun ae ->
1275: print_endline "--- automata entry -- ";
1276: print_endline (string_of_automata_entry ae);
1277: )
1278: initial_states
1279: ;
1280: (initial_states, actions)