1%% @author Bob Ippolito <bob@mochimedia.com>
2%% @copyright 2007 Mochi Media, Inc.
3
4%% @doc Utilities for parsing and quoting.
5
6-module(mochiweb_util).
7-author('bob@mochimedia.com').
8-export([join/2, quote_plus/1, urlencode/1, parse_qs/1, unquote/1]).
9-export([path_split/1]).
10-export([urlsplit/1, urlsplit_path/1, urlunsplit/1, urlunsplit_path/1]).
11-export([guess_mime/1, parse_header/1]).
12-export([shell_quote/1, cmd/1, cmd_string/1, cmd_port/2]).
13-export([record_to_proplist/2, record_to_proplist/3]).
14-export([safe_relative_path/1, partition/2]).
15-export([to_lower/1]).
16-export([test/0]).
17
%% ASCII codes for the percent sign and the full stop.
-define(PERCENT, 37).  % $\%
-define(FULLSTOP, 46). % $\.
%% True when C is an ASCII hexadecimal digit (0-9, a-f, A-F).
-define(IS_HEX(C), (($0 =< C andalso C =< $9) orelse
                    ($a =< C andalso C =< $f) orelse
                    ($A =< C andalso C =< $F))).
%% True when C is copied verbatim by quote_plus: ASCII letters, digits,
%% full stop, hyphen, tilde and underscore.
-define(QS_SAFE(C), (($a =< C andalso C =< $z) orelse
                     ($A =< C andalso C =< $Z) orelse
                     ($0 =< C andalso C =< $9) orelse
                     C =:= ?FULLSTOP orelse C =:= $- orelse
                     C =:= $~ orelse C =:= $_)).
28
%% Convert a nibble value to its uppercase ASCII hexadecimal digit:
%% values below 10 map onto $0..$9, values 10..15 map onto $A..$F.
hexdigit(N) when N < 10 -> N + $0;
hexdigit(N) when N < 16 -> N - 10 + $A.
31
%% Convert an ASCII hexadecimal digit (either case) to its numeric value.
%% Any other character has no matching clause.
unhexdigit(D) when $0 =< D, D =< $9 -> D - $0;
unhexdigit(D) when $A =< D, D =< $F -> 10 + D - $A;
unhexdigit(D) when $a =< D, D =< $f -> 10 + D - $a.
35
%% @spec partition(String, Sep) -> {String, [], []} | {Prefix, Sep, Postfix}
%% @doc Split String around the first occurrence of Sep, in the manner of
%%      Python 2.5's str.partition. When Sep does not occur, the whole
%%      String comes back as the prefix with an empty separator and postfix:
%%      partition("foo/bar", "/") = {"foo", "/", "bar"},
%%      partition("foo", "/") = {"foo", "", ""}.
partition(String, Sep) ->
    Found = partition(String, Sep, []),
    if
        Found =:= undefined -> {String, "", ""};
        true -> Found
    end.

%% Walk the input one character at a time. Seen holds the characters already
%% passed, most recent first. Returns undefined when the input runs out
%% before any position starts with Sep.
partition([], _Sep, _Seen) ->
    undefined;
partition(Str, Sep, Seen) ->
    case partition2(Str, Sep) of
        undefined ->
            [Char | Tail] = Str,
            partition(Tail, Sep, [Char | Seen]);
        Postfix ->
            {lists:reverse(Seen), Sep, Postfix}
    end.

%% If the first argument starts with the second, return what follows that
%% prefix; otherwise return undefined.
partition2(Remaining, []) ->
    Remaining;
partition2([Same | StrTail], [Same | SepTail]) ->
    partition2(StrTail, SepTail);
partition2(_Str, _Sep) ->
    undefined.
65
66
67
%% @spec safe_relative_path(string()) -> string() | undefined
%% @doc Reduce a relative path by resolving its ".." segments, or return
%%      undefined if it is not safe: the path is absolute, contains an
%%      empty segment, or ".." would climb above the starting directory.
%%      A safe result can be joined with an absolute path and will name a
%%      location underneath that path.
safe_relative_path([$/ | _]) ->
    undefined;
safe_relative_path(Path) ->
    safe_relative_path(Path, []).

%% Segments holds the accepted path segments, most recent first.
safe_relative_path("", []) ->
    "";
safe_relative_path("", Segments) ->
    join(lists:reverse(Segments), "/");
safe_relative_path(Path, Segments) ->
    safe_relative_step(partition(Path, "/"), Segments).

%% Decide what to do with the next segment, given as the result of
%% partitioning the remaining path on "/".
safe_relative_step({"", "/", _}, _Segments) ->
    %% /foo or foo//bar
    undefined;
safe_relative_step({"..", _, _}, []) ->
    %% nothing left to climb out of
    undefined;
safe_relative_step({"..", _, Rest}, Segments) ->
    safe_relative_path(Rest, tl(Segments));
safe_relative_step({Part, "/", ""}, Segments) ->
    %% trailing slash: keep it by joining an empty final segment
    safe_relative_path("", ["", Part | Segments]);
safe_relative_step({Part, _, Rest}, Segments) ->
    safe_relative_path(Rest, [Part | Segments]).
98
%% @spec shell_quote(string()) -> string()
%% @doc Quote a string according to UNIX shell quoting rules. The result is
%%      wrapped in double quotes.
shell_quote(L) ->
    %% The accumulator starts out holding the opening double quote.
    OpeningQuote = "\"",
    shell_quote(L, OpeningQuote).
104
%% @spec cmd_port([string()], Options) -> port()
%% @doc Open a port on the shell-quoted command line built from Argv:
%%      open_port({spawn, mochiweb_util:cmd_string(Argv)}, Options).
cmd_port(Argv, Options) ->
    Command = cmd_string(Argv),
    open_port({spawn, Command}, Options).
109
%% @spec cmd([string()]) -> string()
%% @doc Run the shell-quoted command line built from Argv and return its
%%      output: os:cmd(cmd_string(Argv)).
cmd(Argv) ->
    Command = cmd_string(Argv),
    os:cmd(Command).
114
%% @spec cmd_string([string()]) -> string()
%% @doc Build a command string from a list of arguments: each argument is
%%      shell quoted and the results are joined with single spaces.
cmd_string(Argv) ->
    Quoted = lists:map(fun shell_quote/1, Argv),
    join(Quoted, " ").
119
%% @spec join([string()], Separator) -> string()
%% @doc Concatenate a list of strings into one flat string, placing the
%%      separator (a string or a single char) between consecutive items.
join([], _Separator) ->
    [];
join([Only], _Separator) ->
    lists:flatten(Only);
join(Strings, Separator) ->
    Reversed = lists:reverse(Strings),
    lists:flatten(revjoin(Reversed, Separator, [])).

%% Interleave Separator between the items of a REVERSED list, returning a
%% deep list in forward order. When the accumulator starts out empty the
%% first item is taken as is, so no separator trails the result.
revjoin([], _Separator, Acc) ->
    Acc;
revjoin([Last | Earlier], Separator, []) ->
    lists:foldl(fun (S, Joined) -> [S, Separator | Joined] end, [Last], Earlier);
revjoin(Items, Separator, Acc) ->
    lists:foldl(fun (S, Joined) -> [S, Separator | Joined] end, Acc, Items).
136
%% @spec quote_plus(atom() | integer() | float() | string() | binary()) -> string()
%% @doc URL safe encoding of the given term. Atoms, integers, floats and
%%      binaries are first converted to a string.
quote_plus(Binary) when is_binary(Binary) ->
    quote_plus(binary_to_list(Binary));
quote_plus(Atom) when is_atom(Atom) ->
    quote_plus(atom_to_list(Atom));
quote_plus(Int) when is_integer(Int) ->
    quote_plus(integer_to_list(Int));
quote_plus(Float) when is_float(Float) ->
    quote_plus(mochinum:digits(Float));
quote_plus(String) ->
    quote_plus(String, []).

%% Quoted accumulates the encoded output in reverse order.
quote_plus([], Quoted) ->
    lists:reverse(Quoted);
quote_plus([$\s | Rest], Quoted) ->
    %% a space is written as a plus sign
    quote_plus(Rest, [$+ | Quoted]);
quote_plus([C | Rest], Quoted) when ?QS_SAFE(C) ->
    quote_plus(Rest, [C | Quoted]);
quote_plus([C | Rest], Quoted) ->
    %% Everything else becomes %XY. The three characters are pushed in
    %% reverse because the accumulator is reversed at the end.
    <<High:4, Low:4>> = <<C>>,
    Escaped = [hexdigit(Low), hexdigit(High), ?PERCENT],
    quote_plus(Rest, Escaped ++ Quoted).
159
%% @spec urlencode([{Key, Value}]) -> string()
%% @doc URL encode the property list: every key and value goes through
%%      quote_plus, pairs are written as Key=Value and joined with "&".
urlencode(Props) ->
    Pairs = lists:map(fun ({K, V}) ->
                              [quote_plus(K), $=, quote_plus(V)]
                      end, Props),
    %% revjoin expects its input in reverse order
    lists:flatten(revjoin(lists:reverse(Pairs), $&, [])).
167
%% @spec parse_qs(string() | binary()) -> [{Key, Value}]
%% @doc Parse a query string or application/x-www-form-urlencoded body into
%%      a list of decoded {Key, Value} string pairs, in input order.
parse_qs(Binary) when is_binary(Binary) ->
    parse_qs(binary_to_list(Binary));
parse_qs(String) ->
    parse_qs(String, []).

%% Consume one key and one value per step until the input is exhausted.
%% Pairs accumulates the results in reverse order.
parse_qs([], Pairs) ->
    lists:reverse(Pairs);
parse_qs(Input, Pairs) ->
    {Key, AfterKey} = parse_qs_key(Input),
    {Value, AfterValue} = parse_qs_value(AfterKey),
    parse_qs(AfterValue, [{Key, Value} | Pairs]).
181
%% Read one key from the front of a query string. Returns
%% {DecodedKey, Rest}. A terminating "=" is consumed; a terminating ";" or
%% "&" is left at the head of Rest.
parse_qs_key(String) ->
    parse_qs_key(String, []).

%% RevKey holds the raw key characters read so far, most recent first.
parse_qs_key([], RevKey) ->
    {qs_revdecode(RevKey), ""};
parse_qs_key([$= | Rest], RevKey) ->
    {qs_revdecode(RevKey), Rest};
parse_qs_key([Sep | _] = Rest, RevKey) when Sep =:= $; orelse Sep =:= $& ->
    {qs_revdecode(RevKey), Rest};
parse_qs_key([C | Rest], RevKey) ->
    parse_qs_key(Rest, [C | RevKey]).
195
%% Read one value from the front of a query string. Returns
%% {DecodedValue, Rest}; the terminating ";" or "&", if any, is consumed.
parse_qs_value(String) ->
    parse_qs_value(String, []).

%% RevValue holds the raw value characters read so far, most recent first.
parse_qs_value([], RevValue) ->
    {qs_revdecode(RevValue), ""};
parse_qs_value([Sep | Rest], RevValue) when Sep =:= $; orelse Sep =:= $& ->
    {qs_revdecode(RevValue), Rest};
parse_qs_value([C | Rest], RevValue) ->
    parse_qs_value(Rest, [C | RevValue]).
207
%% @spec unquote(string() | binary()) -> string()
%% @doc Unquote a URL encoded string: "+" becomes a space and %XY escapes
%%      become the character they encode.
unquote(Binary) when is_binary(Binary) ->
    unquote(binary_to_list(Binary));
unquote(String) ->
    %% the decoder consumes its input in reverse order
    Reversed = lists:reverse(String),
    qs_revdecode(Reversed).
214
%% Decode a REVERSED URL encoded string and return it in forward order.
qs_revdecode(S) ->
    qs_revdecode(S, []).

%% Because the input is reversed, an escape "%XY" shows up as Y, X, "%".
%% Decoded is built front-first, so it needs no final reverse.
qs_revdecode([], Decoded) ->
    Decoded;
qs_revdecode([Lo, Hi, ?PERCENT | Rest], Decoded) when ?IS_HEX(Lo), ?IS_HEX(Hi) ->
    Byte = (unhexdigit(Hi) bsl 4) bor unhexdigit(Lo),
    qs_revdecode(Rest, [Byte | Decoded]);
qs_revdecode([$+ | Rest], Decoded) ->
    qs_revdecode(Rest, [$\s | Decoded]);
qs_revdecode([C | Rest], Decoded) ->
    qs_revdecode(Rest, [C | Decoded]).
226
%% @spec urlsplit(Url) -> {Scheme, Netloc, Path, Query, Fragment}
%% @doc Split a URL into a 5-tuple of strings. Percent escapes are left
%%      untouched. Only supports HTTP style URLs.
urlsplit(Url) ->
    %% Each step peels one component off the front and hands the rest on.
    {Scheme, AfterScheme} = urlsplit_scheme(Url),
    {Netloc, AfterNetloc} = urlsplit_netloc(AfterScheme),
    {Path, Query, Fragment} = urlsplit_path(AfterNetloc),
    {Scheme, Netloc, Path, Query, Fragment}.
235
%% Split a leading "scheme:" off Url. Returns {Scheme, Rest} with Scheme
%% lowercased, or {"", Url} when Url does not start with a scheme.
%%
%% Only a non-empty run of RFC 3986 scheme characters (letters, digits,
%% "+", "-", ".") directly followed by ":" counts as a scheme. This keeps a
%% ":" that appears later in the path, query or fragment (for example
%% "/foo?next=http://x") from being mistaken for the scheme delimiter.
urlsplit_scheme(Url) ->
    case urlsplit_scheme(Url, []) of
        no_scheme ->
            {"", Url};
        Result ->
            Result
    end.

%% Acc holds the candidate scheme characters read so far, in reverse.
urlsplit_scheme([$: | Rest], Acc = [_ | _]) ->
    {to_lower(lists:reverse(Acc)), Rest};
urlsplit_scheme([C | Rest], Acc)
  when (C >= $a andalso C =< $z) orelse
       (C >= $A andalso C =< $Z) orelse
       (C >= $0 andalso C =< $9) orelse
       C =:= $+ orelse C =:= $- orelse C =:= $. ->
    urlsplit_scheme(Rest, [C | Acc]);
urlsplit_scheme(_Rest, _Acc) ->
    %% end of input, an empty scheme, or a character no scheme may contain
    no_scheme.
245
%% Split the network location off the front of a URL whose scheme has
%% already been removed. Returns {Netloc, Rest}. Netloc is "" unless the
%% input starts with "//".
urlsplit_netloc("//" ++ Rest) ->
    urlsplit_netloc(Rest, []);
urlsplit_netloc(Path) ->
    {"", Path}.

%% Collect characters up to the first "/", "?" or "#", which stays at the
%% head of Rest. Acc holds the netloc read so far, in reverse.
urlsplit_netloc("", Acc) ->
    %% The URL ends right after the netloc (e.g. "http://host"). Without
    %% this clause such input crashed with function_clause.
    {lists:reverse(Acc), ""};
urlsplit_netloc(Rest = [C | _], Acc) when C =:= $/; C =:= $?; C =:= $# ->
    {lists:reverse(Acc), Rest};
urlsplit_netloc([C | Rest], Acc) ->
    urlsplit_netloc(Rest, [C | Acc]).
255
256
%% @spec path_split(string()) -> {Part, Rest}
%% @doc Split a path at its first "/", as in URL traversal. The slash
%%      itself is dropped; Rest is "" when the path contains no slash.
%%      path_split("foo/bar") = {"foo", "bar"},
%%      path_split("/foo/bar") = {"", "foo/bar"}.
path_split(S) ->
    case lists:splitwith(fun (C) -> C =/= $/ end, S) of
        {Part, [$/ | Rest]} ->
            {Part, Rest};
        {Part, []} ->
            {Part, ""}
    end.
270
271
%% @spec urlunsplit({Scheme, Netloc, Path, Query, Fragment}) -> string()
%% @doc Assemble a URL from the 5-tuple. Path must be absolute. A non-empty
%%      Scheme is followed by "://"; an empty one adds nothing.
urlunsplit({Scheme, Netloc, Path, Query, Fragment}) ->
    Prefix = case Scheme of
                 "" -> "";
                 _ -> [Scheme, "://"]
             end,
    Tail = urlunsplit_path({Path, Query, Fragment}),
    lists:flatten([Prefix, Netloc, Tail]).
278
%% @spec urlunsplit_path({Path, Query, Fragment}) -> string()
%% @doc Assemble a URL path from the 3-tuple. A non-empty Query is
%%      prefixed with "?" and a non-empty Fragment with "#"; empty ones
%%      are left out together with their delimiter.
urlunsplit_path({Path, Query, Fragment}) ->
    QueryPart = case Query of
                    "" -> "";
                    _ -> [$? | Query]
                end,
    FragmentPart = case Fragment of
                       "" -> "";
                       _ -> [$# | Fragment]
                   end,
    lists:flatten([Path, QueryPart, FragmentPart]).
285
%% @spec urlsplit_path(Url) -> {Path, Query, Fragment}
%% @doc Split a URL path into a 3-tuple of strings. Percent escapes are
%%      left untouched. Only supports HTTP style paths.
urlsplit_path(Path) ->
    %% The path proper runs up to the first "?" or "#", whichever is first.
    IsPathChar = fun (C) -> C =/= $? andalso C =/= $# end,
    case lists:splitwith(IsPathChar, Path) of
        {P, []} ->
            {P, "", ""};
        {P, [$? | AfterMark]} ->
            {Query, Fragment} = urlsplit_query(AfterMark),
            {P, Query, Fragment};
        {P, [$# | Fragment]} ->
            %% everything after "#" is fragment, including any later "?"
            {P, "", Fragment}
    end.

%% Split what follows "?" into {Query, Fragment} at the first "#".
urlsplit_query(Query) ->
    case lists:splitwith(fun (C) -> C =/= $# end, Query) of
        {Q, [$# | Fragment]} ->
            {Q, Fragment};
        {Q, []} ->
            {Q, ""}
    end.
311
%% @spec guess_mime(string()) -> string()
%% @doc  Guess the mime type of a file by the extension of its filename.
%%       The extension is compared exactly as filename:extension/1 returns
%%       it; anything not in the table yields "text/plain".
guess_mime(File) ->
    mime_for_extension(filename:extension(File)).

%% Extension (leading dot included) to mime type lookup table.
mime_for_extension(".html") -> "text/html";
mime_for_extension(".xhtml") -> "application/xhtml+xml";
mime_for_extension(".xml") -> "application/xml";
mime_for_extension(".css") -> "text/css";
mime_for_extension(".js") -> "application/x-javascript";
mime_for_extension(".jpg") -> "image/jpeg";
mime_for_extension(".gif") -> "image/gif";
mime_for_extension(".png") -> "image/png";
mime_for_extension(".swf") -> "application/x-shockwave-flash";
mime_for_extension(".zip") -> "application/zip";
mime_for_extension(".bz2") -> "application/x-bzip2";
mime_for_extension(".gz") -> "application/x-gzip";
mime_for_extension(".tar") -> "application/x-tar";
mime_for_extension(".tgz") -> "application/x-gzip";
mime_for_extension(".txt") -> "text/plain";
mime_for_extension(".doc") -> "application/msword";
mime_for_extension(".pdf") -> "application/pdf";
mime_for_extension(".xls") -> "application/vnd.ms-excel";
mime_for_extension(".rtf") -> "application/rtf";
mime_for_extension(".mov") -> "video/quicktime";
mime_for_extension(".mp3") -> "audio/mpeg";
mime_for_extension(".z") -> "application/x-compress";
mime_for_extension(".wav") -> "audio/x-wav";
mime_for_extension(".ico") -> "image/x-icon";
mime_for_extension(".bmp") -> "image/bmp";
mime_for_extension(".m4a") -> "audio/mpeg";
mime_for_extension(".m3u") -> "audio/x-mpegurl";
mime_for_extension(".exe") -> "application/octet-stream";
mime_for_extension(".csv") -> "text/csv";
mime_for_extension(_Other) -> "text/plain".
377
%% @spec parse_header(string()) -> {Type, [{K, V}]}
%% @doc  Parse a Content-Type like header. Returns the lowercased main
%%       value and a property list of its ";"-separated options, with
%%       lowercased names and unquoted values.
parse_header(String) ->
    %% TODO: This is exactly as broken as Python's cgi module.
    %%       Should parse properly like mochiweb_cookies.
    Stripped = lists:map(fun string:strip/1, string:tokens(String, ";")),
    [Type | Parts] = Stripped,
    NotEquals = fun (C) -> C =/= $= end,
    %% Split each part at its first "=". The generator pattern drops parts
    %% that have no name or no "=" at all.
    Options = [{to_lower(string:strip(Name)),
                unquote_header(string:strip(Value))}
               || Part <- Parts,
                  {[_ | _] = Name, [$= | Value]}
                      <- [lists:splitwith(NotEquals, Part)]],
    {to_lower(Type), Options}.
400
%% Remove the surrounding double quotes from a header option value and
%% resolve backslash escapes inside it. A value that does not start with a
%% double quote is returned unchanged.
unquote_header([$" | Quoted]) ->
    unquote_header(Quoted, []);
unquote_header(Plain) ->
    Plain.

%% Out accumulates the unquoted characters in reverse order.
unquote_header(Tail, Out) when Tail =:= ""; Tail =:= "\"" ->
    %% end of input, with or without the closing quote
    lists:reverse(Out);
unquote_header([$\\ | [Escaped | Rest]], Out) ->
    %% backslash escape: keep the following character literally
    unquote_header(Rest, [Escaped | Out]);
unquote_header([Char | Rest], Out) ->
    unquote_header(Rest, [Char | Out]).
414
%% @spec record_to_proplist(Record, Fields) -> proplist()
%% @doc Same as record_to_proplist/3 with a TypeKey of '__record'.
record_to_proplist(Record, Fields) ->
    record_to_proplist(Record, Fields, '__record').

%% @spec record_to_proplist(Record, Fields, TypeKey) -> proplist()
%% @doc Turn Record into a proplist. The first element of the record tuple
%%      (the record type) is stored under TypeKey; every remaining element
%%      is stored under the name at the same position in Fields.
%%      Fields should be obtained by calling record_info(fields, record_type)
%%      where record_type is the record type of Record. The call has no
%%      matching clause unless Fields has one name per record field.
record_to_proplist(Record, Fields, TypeKey)
  when is_tuple(Record),
       is_list(Fields),
       tuple_size(Record) =:= length(Fields) + 1 ->
    [Type | Values] = tuple_to_list(Record),
    [{TypeKey, Type} | lists:zip(Fields, Values)].
431
432
%% Worker for shell_quote/1. Acc holds the quoted output in reverse order.
%% A double quote, backquote, backslash or dollar sign gets a backslash in
%% front of it; the closing double quote is added once the input is used up.
shell_quote([], Acc) ->
    lists:reverse([$\" | Acc]);
shell_quote([C | Rest], Acc) ->
    case lists:member(C, "\"`\\$") of
        true ->
            shell_quote(Rest, [C, $\\ | Acc]);
        false ->
            shell_quote(Rest, [C | Acc])
    end.
440
%% Lowercase one ISO 8859-1 (Latin-1) character code. Uppercase letters are
%% A-Z, 16#C0-16#D6 and 16#D8-16#DE; 16#D7 is the multiplication sign and
%% stays as it is. Each lowercase form is 32 code points above the
%% uppercase one. Anything else, non-integers included, is returned
%% unchanged.
to_lower_char(C) when is_integer(C), C >= $A, C =< $Z ->
    C + 32;
to_lower_char(C) when is_integer(C), C >= 16#C0, C =< 16#D6 ->
    %% The range starts at 16#C0 (A with grave). It used to start at
    %% 16#C1, which left that one letter uppercase.
    C + 32;
to_lower_char(C) when is_integer(C), C >= 16#D8, C =< 16#DE ->
    C + 32;
to_lower_char(C) ->
    C.

%% @spec to_lower(string() | integer()) -> string() | integer()
%% @doc Lowercase a Latin-1 string, or a single character code.
to_lower(S) when is_list(S) ->
    [to_lower_char(C) || C <- S];
to_lower(C) when is_integer(C) ->
    to_lower_char(C).
454
%% Run every self-test of this module in a fixed order. A failing test
%% crashes at its mismatching assertion; ok is returned when all pass.
test() ->
    Tests = [fun test_join/0,
             fun test_quote_plus/0,
             fun test_unquote/0,
             fun test_urlencode/0,
             fun test_parse_qs/0,
             fun test_urlsplit_path/0,
             fun test_urlunsplit_path/0,
             fun test_urlsplit/0,
             fun test_urlunsplit/0,
             fun test_path_split/0,
             fun test_guess_mime/0,
             fun test_parse_header/0,
             fun test_shell_quote/0,
             fun test_cmd/0,
             fun test_cmd_string/0,
             fun test_partition/0,
             fun test_safe_relative_path/0],
    lists:foreach(fun (Test) -> Test() end, Tests),
    ok.
474
%% shell_quote/1 wraps the input in double quotes and backslash-escapes
%% $, " and `, leaving the single quote alone.
test_shell_quote() ->
    Input = "foo $bar\"`' baz",
    Expected = "\"foo \\$bar\\\"\\`' baz\"",
    Expected = shell_quote(Input),
    ok.
478
%% cmd/1 runs echo through the shell; the quoted argument must come back
%% literally, with no variable or command substitution.
test_cmd() ->
    Output = cmd(["echo", "$bling$ `word`!"]),
    "$bling$ `word`!\n" = Output,
    ok.
482
%% cmd_string/1 quotes each argument and joins them with a single space.
test_cmd_string() ->
    Argv = ["echo", "$bling$ `word`!"],
    Expected = "\"echo\" \"\\$bling\\$ \\`word\\`!\"",
    Expected = cmd_string(Argv),
    ok.
486
%% parse_header/1 separates the main value from its ";"-delimited options.
test_parse_header() ->
    Parsed = parse_header("multipart/form-data; boundary=AaB03x"),
    {"multipart/form-data", [{"boundary", "AaB03x"}]} = Parsed,
    ok.
491
%% guess_mime/1: known extensions map to their type, everything else to
%% "text/plain".
test_guess_mime() ->
    Cases = [{"", "text/plain"},
             {".text", "text/plain"},
             {".zip", "application/zip"},
             {"x.zip", "application/zip"},
             {"x.html", "text/html"},
             {"x.xhtml", "application/xhtml+xml"}],
    lists:foreach(fun ({File, Mime}) -> Mime = guess_mime(File) end, Cases),
    ok.
500
%% path_split/1 splits at the first "/" and drops it.
test_path_split() ->
    Cases = [{"/foo/bar", {"", "foo/bar"}},
             {"foo/bar", {"foo", "bar"}},
             {"bar", {"bar", ""}}],
    lists:foreach(fun ({Path, Split}) -> Split = path_split(Path) end, Cases),
    ok.
506
%% urlsplit/1 on a bare path and on a full URL; a "?" after "#" belongs to
%% the fragment.
test_urlsplit() ->
    Cases = [{"/foo#bar?baz",
              {"", "", "/foo", "", "bar?baz"}},
             {"http://host:port/foo#bar?baz",
              {"http", "host:port", "/foo", "", "bar?baz"}}],
    lists:foreach(fun ({Url, Parts}) -> Parts = urlsplit(Url) end, Cases),
    ok.
512
%% urlsplit_path/1 across the combinations of query and fragment.
test_urlsplit_path() ->
    Cases = [{"/foo/bar", {"/foo/bar", "", ""}},
             {"/foo?baz", {"/foo", "baz", ""}},
             {"/foo#bar?baz", {"/foo", "", "bar?baz"}},
             {"/foo#bar?baz#wibble", {"/foo", "", "bar?baz#wibble"}},
             {"/foo?bar#baz", {"/foo", "bar", "baz"}},
             {"/foo?bar?baz#baz", {"/foo", "bar?baz", "baz"}}],
    lists:foreach(fun ({Path, Parts}) -> Parts = urlsplit_path(Path) end,
                  Cases),
    ok.
521
%% urlunsplit/1 rebuilds the URLs used in test_urlsplit/0.
test_urlunsplit() ->
    Cases = [{{"", "", "/foo", "", "bar?baz"},
              "/foo#bar?baz"},
             {{"http", "host:port", "/foo", "", "bar?baz"},
              "http://host:port/foo#bar?baz"}],
    lists:foreach(fun ({Parts, Url}) -> Url = urlunsplit(Parts) end, Cases),
    ok.
527
%% urlunsplit_path/1 rebuilds the paths used in test_urlsplit_path/0.
test_urlunsplit_path() ->
    Cases = [{{"/foo/bar", "", ""}, "/foo/bar"},
             {{"/foo", "baz", ""}, "/foo?baz"},
             {{"/foo", "", "bar?baz"}, "/foo#bar?baz"},
             {{"/foo", "", "bar?baz#wibble"}, "/foo#bar?baz#wibble"},
             {{"/foo", "bar", "baz"}, "/foo?bar#baz"},
             {{"/foo", "bar?baz", "baz"}, "/foo?bar?baz#baz"}],
    lists:foreach(fun ({Parts, Path}) -> Path = urlunsplit_path(Parts) end,
                  Cases),
    ok.
536
%% join/2 with char and string separators, nested items, a single item and
%% an empty separator.
test_join() ->
    Cases = [{["foo", "bar", "baz"], $,, "foo,bar,baz"},
             {["foo", "bar", "baz"], ",", "foo,bar,baz"},
             {[["foo", " bar"]], ",", "foo bar"},
             {[["foo", " bar"], "baz"], ",", "foo bar,baz"},
             {["foo"], ",", "foo"},
             {["foo", "bar", "baz"], "", "foobarbaz"}],
    lists:foreach(fun ({Strings, Sep, Joined}) ->
                          Joined = join(Strings, Sep)
                  end, Cases),
    ok.
545
%% quote_plus/1 for every accepted input type (atom, integer, float, string
%% and binary), plus the space, control character and reserved character
%% encodings.
test_quote_plus() ->
    "foo" = quote_plus(foo),
    "1" = quote_plus(1),
    "1.1" = quote_plus(1.1),
    "foo" = quote_plus("foo"),
    %% The binary clause had no coverage; this takes the place of a
    %% duplicated "foo\n" assertion.
    "foo" = quote_plus(<<"foo">>),
    "foo+bar" = quote_plus("foo bar"),
    "foo%0A" = quote_plus("foo\n"),
    "foo%3B%26%3D" = quote_plus("foo;&="),
    ok.
556
%% unquote/1 decodes both "+" and %XY escapes.
test_unquote() ->
    Cases = [{"foo+bar", "foo bar"},
             {"foo%20bar", "foo bar"},
             {"foo%0D%0A", "foo\r\n"}],
    lists:foreach(fun ({Quoted, Plain}) -> Plain = unquote(Quoted) end, Cases),
    ok.
562
%% urlencode/1 with atom and string keys and an integer value.
test_urlencode() ->
    Props = [{foo, "bar"},
             {"baz", "wibble \r\n"},
             {z, 1}],
    Encoded = urlencode(Props),
    "foo=bar&baz=wibble+%0D%0A&z=1" = Encoded,
    ok.
568
%% parse_qs/1 decodes the string that test_urlencode/0 expects.
test_parse_qs() ->
    Parsed = parse_qs("foo=bar&baz=wibble+%0D%0A&z=1"),
    [{"foo", "bar"}, {"baz", "wibble \r\n"}, {"z", "1"}] = Parsed,
    ok.
573
%% partition/2: missing separator, separator at either end, and a
%% separator longer than one character.
test_partition() ->
    Cases = [{"foo", "/", {"foo", "", ""}},
             {"foo/bar", "/", {"foo", "/", "bar"}},
             {"foo/", "/", {"foo", "/", ""}},
             {"/bar", "/", {"", "/", "bar"}},
             {"foo/bar", "oo/ba", {"f", "oo/ba", "r"}}],
    lists:foreach(fun ({String, Sep, Parts}) ->
                          Parts = partition(String, Sep)
                  end, Cases),
    ok.
581
%% safe_relative_path/1: paths that reduce cleanly, then paths that must
%% be rejected with undefined.
test_safe_relative_path() ->
    Cases = [{"foo", "foo"},
             {"foo/", "foo/"},
             {"foo/bar/..", "foo"},
             {"foo/../bar", "bar"},
             {"foo/../bar/", "bar/"},
             {"foo/..", ""},
             {"foo/../", ""},
             {"/foo", undefined},
             {"../foo", undefined},
             {"foo/../..", undefined},
             {"foo//", undefined}],
    lists:foreach(fun ({Path, Reduced}) ->
                          Reduced = safe_relative_path(Path)
                  end, Cases),
    ok.
595