--- /usr/local/lib/erlang/lib/stdlib-1.9.2/src/regexp.erl Sat Feb 24 14:35:17 2001 +++ gregexp.erl Thu Mar 22 12:43:51 2001 @@ -14,8 +14,18 @@ %% AB. All Rights Reserved.'' %% %% $Id$ + +%% Submatch extraction (C) March 2001 pascal.brisset@cellicium.com . +%% This module extends otp_src_R7B-1/libstdlib-1.9.2/src/regexp.erl +%% with the syntax "\\(" and "\\)". This makes it possible to extract +%% subgroups of a match. For example: +%% URL="\\(.+\\)://\\(.+\\)\\(/.+\\)(\\?\\(.*\\)(&\\(.*\\))*)?", +%% gregexp:groups("http://localhost:81/script?arg&arg2&arg3", URL). +%% {match,["http","localhost:81","/script","arg","arg2","arg3"]} %% --module(regexp). +%% Note that the character '(' can be matched with the regexp "[(]". + +-module(gregexp). %% This module provides a basic set of regular expression functions %% for strings. The functions provided are taken from AWK. @@ -26,11 +36,13 @@ -export([sh_to_awk/1,parse/1,format_error/1,match/2,first_match/2,matches/2]). -export([sub/3,gsub/3,split/2]). +-export([groups/2]). -import(string, [substr/2,substr/3]). -import(lists, [reverse/1]). %% -type matchres() = {match,Start,Length} | nomatch | {error,E}. +%% -type gmatchres() = {match,[string()]} | nomatch | {error,E}. %% -type subres() = {ok,RepString,RepCount} | {error,E}. %% -type splitres() = {ok,[SubString]} | {error,E}. @@ -50,6 +62,7 @@ %% reg3 -> reg3 "?" : {optional,'$1'}. %% reg3 -> reg4 : '$1'. %% reg4 -> "(" reg ")" : '$2'. +%% reg4 -> "\\(" reg "\\)" : '$2'. %% reg4 -> "\\" char : '$2'. %% reg4 -> "^" : bos. %% reg4 -> "$" : eos. @@ -85,6 +98,7 @@ {L,S1} = reg3(S0), reg2p(S1, L). +reg2p([$\\,$)|_]=S, L) -> {L,S}; reg2p([C|S0], L) when C /= $|, C /= $) -> {R,S1} = reg3([C|S0]), reg2p(S1, {concat,L,R}); @@ -110,6 +124,11 @@ {R,[$)|S1]} -> {R,S1}; {R,S} -> throw({error,{unterminated,"("}}) end; +reg4([$\\,$(|S0]) -> + case reg(S0) of + {R,[$\\,$)|S1]} -> {{group,R},S1}; + {R,S} -> throw({error,{unterminated,"\\("}}) + end; reg4([$\\,O1,O2,O3|S]) when O1 >= $0, O1 =< $7, O2 >= $0, O2 =< $7, O3 >= $0, O3 =< $7 -> {(O1*8 + O2)*8 + O3 - 73*$0,S}; @@ -175,7 +194,7 @@ % char_string(S0, {concat,L,C}); %char_string(S, L) -> {L,S}. -%% -deftype re_app_res() = {match,RestPos,Rest} | nomatch. +%% -deftype re_app_res() = {match,RestPos,Rest,Groups} | nomatch. %% re_apply(String, StartPos, RegExp) -> re_app_res(). %% @@ -207,6 +226,15 @@ re_apply(CE, [{kclosure,CE}|More], S, P)); re_apply({pclosure,CE}, More, S, P) -> re_apply(CE, [{kclosure,CE}|More], S, P); +re_apply({group,RE}, More, S, P) -> + %% Insert a pseudo-regexp so that we can record the group when we + %% reach its end. + re_apply(RE, [{endgroup,P}|More], S, P); +re_apply({endgroup,St}, More, S, P) -> + case re_apply_more(More, S, P) of + nomatch -> nomatch; + {match, RP, R, G} -> {match, RP, R, [{St, P-St}|G]} + end; re_apply({optional,CE}, More, S, P) -> re_apply_or(re_apply_more(More, S, P), re_apply(CE, More, S, P)); @@ -230,7 +258,7 @@ %% re_apply_more([RegExp], String, Length) -> re_app_res(). re_apply_more([RE|More], S, P) -> re_apply(RE, More, S, P); -re_apply_more([], S, P) -> {match,P,S}. +re_apply_more([], S, P) -> {match,P,S,[]}. %% in_char_class(Char, Class) -> bool(). @@ -243,8 +271,8 @@ %% If we want the best match then choose the longest match, else just %% choose one by trying sequentially. -re_apply_or({match,P1,S1}, {match,P2,S2}) when P1 >= P2 -> {match,P1,S1}; -re_apply_or({match,P1,S1}, {match,P2,S2}) -> {match,P2,S2}; +re_apply_or({match,P1,S1,G1},{match,P2,S2,G2}) when P1>=P2 -> {match,P1,S1,G1}; +re_apply_or({match,P1,S1,G1}, {match,P2,S2,G2}) -> {match,P2,S2,G2}; re_apply_or(nomatch, R2) -> R2; re_apply_or(R1, nomatch) -> R1. @@ -357,7 +385,7 @@ first_match(RE, S, St) when S /= [] -> case re_apply(S, St, RE) of - {match,P,Rest} -> {St,P-St}; + {match,P,Rest,_Groups} -> {St,P-St}; nomatch -> first_match(RE, tl(S), St+1) end; first_match(RE, [], St) -> nomatch. @@ -446,10 +474,24 @@ split_apply([], P, RE, T, Sub) -> [reverse(Sub)]; split_apply(S, P, RE, T, Sub) -> case re_apply(S, P, RE) of - {match,P,Rest} -> + {match,P,Rest,_Groups} -> split_apply(tl(S), P+1, RE, T, [hd(S)|Sub]); - {match,P1,Rest} -> + {match,P1,Rest,_Groups} -> [reverse(Sub)|split_apply(Rest, P1, RE, T, [])]; nomatch -> split_apply(tl(S), P+1, RE, T, [hd(S)|Sub]) + end. + +%%%% + +groups(S, RegExp) when list(RegExp) -> + {ok, ParsedRegExp} = parse(RegExp), + groups(S, ParsedRegExp); + +groups(S, ParsedRegExp) -> + case re_apply(S, 1, ParsedRegExp) of + {match, _RestPos, _Rest, Groups} -> + GetGroup = fun ({Start,Len}) -> lists:sublist(S,Start,Len) end, + {match, lists:map(GetGroup, Groups)}; + Other -> Other end.