1%% @author Bob Ippolito <bob@mochimedia.com>
2%% @copyright 2007 Mochi Media, Inc.
3
4%% @doc Utilities for parsing and quoting.
5
6-module(mochiweb_util).
7-author('bob@mochimedia.com').
8-export([join/2, quote_plus/1, urlencode/1, parse_qs/1, unquote/1]).
9-export([path_split/1]).
10-export([urlsplit/1, urlsplit_path/1, urlunsplit/1, urlunsplit_path/1]).
11-export([guess_mime/1, parse_header/1]).
12-export([shell_quote/1, cmd/1, cmd_string/1, cmd_port/2]).
13-export([record_to_proplist/2, record_to_proplist/3]).
14-export([safe_relative_path/1, partition/2]).
15-export([test/0]).
16
%% Character constants and classification macros for URL quoting.
-define(PERCENT, 37).  % the "%" character, which introduces an escape
-define(FULLSTOP, 46). % the "." character
%% True when C is a hexadecimal digit character in either case.
-define(IS_HEX(C), ((C >= $0 andalso C =< $9) orelse
                    (C >= $A andalso C =< $F) orelse
                    (C >= $a andalso C =< $f))).
%% True when C may appear unescaped in a query string: ASCII letters,
%% digits, and the four punctuation characters "-", "_", "." and "~".
-define(QS_SAFE(C), ((C >= $0 andalso C =< $9) orelse
                     (C >= $A andalso C =< $Z) orelse
                     (C >= $a andalso C =< $z) orelse
                     (C =:= $- orelse C =:= $_ orelse
                      C =:= ?FULLSTOP orelse C =:= $~))).
27
%% Map a nibble value (0..15) to its uppercase hexadecimal character.
hexdigit(Nibble) when Nibble < 10 -> Nibble + $0;
hexdigit(Nibble) when Nibble < 16 -> Nibble - 10 + $A.
30
%% Map a hexadecimal digit character (either case) to its value 0..15.
%% Any other character raises function_clause.
unhexdigit(Digit) when Digit >= $0, Digit =< $9 -> Digit - $0;
unhexdigit(Digit) when Digit >= $A, Digit =< $F -> 10 + (Digit - $A);
unhexdigit(Digit) when Digit >= $a, Digit =< $f -> 10 + (Digit - $a).
34
%% @spec partition(String, Sep) -> {String, [], []} | {Prefix, Sep, Postfix}
%% @doc Split String around the first occurrence of Sep, in the manner of
%%      Python 2.5's str.partition. When Sep does not occur, the whole
%%      string comes back as the prefix with two empty strings:
%%      partition("foo/bar", "/") = {"foo", "/", "bar"},
%%      partition("foo", "/") = {"foo", "", ""}.
partition(String, Sep) ->
    Outcome = partition(String, Sep, []),
    if
        Outcome =:= undefined -> {String, "", ""};
        true -> Outcome
    end.

%% Walk the string left to right. Seen holds the characters already
%% passed, in reverse order. Returns undefined when Sep never matches.
partition([], _Sep, _Seen) ->
    undefined;
partition(Str, Sep, Seen) ->
    case partition2(Str, Sep) of
        undefined ->
            [Ch | Tail] = Str,
            partition(Tail, Sep, [Ch | Seen]);
        After ->
            {lists:reverse(Seen), Sep, After}
    end.

%% When the second argument is a prefix of the first, return whatever
%% follows that prefix; otherwise return undefined.
partition2(After, []) ->
    After;
partition2([Ch | Str], [Ch | Sep]) ->
    partition2(Str, Sep);
partition2(_Str, _Sep) ->
    undefined.
64
65
66
%% @spec safe_relative_path(string()) -> string() | undefined
%% @doc Reduce a relative path by resolving its ".." components. Returns
%%      undefined when the path is not safe: it starts with "/", contains
%%      an empty component ("foo//bar"), or uses ".." to climb above its
%%      starting point. A safe result can be joined to an absolute path
%%      and stays inside that directory.
safe_relative_path([$/ | _]) ->
    undefined;
safe_relative_path(P) ->
    safe_relative_path(P, []).

%% RevParts holds the components accepted so far, most recent first.
safe_relative_path([], []) ->
    "";
safe_relative_path([], RevParts) ->
    string:join(lists:reverse(RevParts), "/");
safe_relative_path(Path, RevParts) ->
    case {partition(Path, "/"), RevParts} of
        {{[], "/", _}, _} ->
            %% empty component, as in /foo or foo//bar
            undefined;
        {{"..", _, _}, []} ->
            %% nothing left to climb out of
            undefined;
        {{"..", _, Remaining}, [_ | Parents]} ->
            safe_relative_path(Remaining, Parents);
        {{Part, "/", []}, _} ->
            %% trailing slash: the empty part keeps it in the joined result
            safe_relative_path("", ["", Part | RevParts]);
        {{Part, _, Remaining}, _} ->
            safe_relative_path(Remaining, [Part | RevParts])
    end.
97
%% @spec shell_quote(string()) -> string()
%% @doc Quote a string for a UNIX shell. The result is wrapped in double
%%      quotes, with the double quote, backquote, backslash and dollar
%%      characters backslash-escaped.
shell_quote(L) ->
    OpeningQuote = "\"",
    shell_quote(L, OpeningQuote).
103
%% @spec cmd_port([string()], Options) -> port()
%% @doc Open a port that spawns the shell-quoted command built from Argv,
%%      i.e. open_port({spawn, mochiweb_util:cmd_string(Argv)}, Options).
cmd_port(Argv, Options) ->
    Command = cmd_string(Argv),
    open_port({spawn, Command}, Options).
108
%% @spec cmd([string()]) -> string()
%% @doc Run the shell-quoted command built from Argv with os:cmd/1 and
%%      return its output, i.e. os:cmd(cmd_string(Argv)).
cmd(Argv) ->
    Command = cmd_string(Argv),
    os:cmd(Command).
113
%% @spec cmd_string([string()]) -> string()
%% @doc Build a command line from a list of arguments: each argument is
%%      shell quoted and the results are joined with single spaces.
cmd_string(Argv) ->
    Quoted = lists:map(fun shell_quote/1, Argv),
    join(Quoted, " ").
118
%% @spec join([string()], Separator) -> string()
%% @doc Concatenate a list of strings, placing Separator (a string or a
%%      single character) between consecutive elements. The result is
%%      always a flat string.
join([], _Separator) ->
    [];
join([Only], _Separator) ->
    lists:flatten(Only);
join(Strings, Separator) ->
    Reversed = lists:reverse(Strings),
    lists:flatten(revjoin(Reversed, Separator, [])).

%% Fold a reversed list of strings into a forward-ordered deep list with
%% Separator between elements. No separator is added while the
%% accumulator is still empty.
revjoin(RevStrings, Separator, Acc0) ->
    lists:foldl(fun (S, []) -> [S];
                    (S, Acc) -> [S, Separator | Acc]
                end, Acc0, RevStrings).
135
%% @spec quote_plus(atom() | integer() | float() | string() | binary()) -> string()
%% @doc URL safe encoding of the given term. Atoms, integers, floats and
%%      binaries are converted to a string first; the string is then
%%      escaped with spaces written as "+".
quote_plus(Binary) when is_binary(Binary) ->
    quote_plus(binary_to_list(Binary), []);
quote_plus(Int) when is_integer(Int) ->
    quote_plus(integer_to_list(Int), []);
quote_plus(Float) when is_float(Float) ->
    quote_plus(mochinum:digits(Float));
quote_plus(Atom) when is_atom(Atom) ->
    quote_plus(atom_to_list(Atom), []);
quote_plus(String) ->
    quote_plus(String, []).
148
%% Accumulator loop behind quote_plus/1. Acc holds the output reversed.
%%  - characters accepted by ?QS_SAFE are copied through unchanged
%%  - a space becomes "+"
%%  - any other value up to 255 is treated as one byte and written "%XX"
%%  - a code point above 255 is written as the escapes of its UTF-8 bytes;
%%    previously <<C>> kept only the low 8 bits and produced a wrong escape.
%%    A value that is not a valid Unicode code point raises badarg.
quote_plus([], Acc) ->
    lists:reverse(Acc);
quote_plus([C | Rest], Acc) when ?QS_SAFE(C) ->
    quote_plus(Rest, [C | Acc]);
quote_plus([$\s | Rest], Acc) ->
    quote_plus(Rest, [$+ | Acc]);
quote_plus([C | Rest], Acc) when is_integer(C), C > 16#FF ->
    %% Every UTF-8 byte of such a code point is >= 16#80, so each one
    %% falls through to the single-byte clause below.
    quote_plus(binary_to_list(<<C/utf8>>) ++ Rest, Acc);
quote_plus([C | Rest], Acc) ->
    <<Hi:4, Lo:4>> = <<C>>,
    quote_plus(Rest, [hexdigit(Lo), hexdigit(Hi), ?PERCENT | Acc]).
158
%% @spec urlencode([{Key, Value}]) -> string()
%% @doc URL encode a property list as "k1=v1&k2=v2...", passing every key
%%      and value through quote_plus/1.
urlencode(Props) ->
    Pairs = lists:map(fun ({K, V}) ->
                              [quote_plus(K), $=, quote_plus(V)]
                      end, Props),
    lists:flatten(revjoin(lists:reverse(Pairs), $&, [])).
166
%% @spec parse_qs(string() | binary()) -> [{Key, Value}]
%% @doc Parse a query string or application/x-www-form-urlencoded body
%%      into a list of decoded {Key, Value} string pairs, in input order.
parse_qs(Binary) when is_binary(Binary) ->
    parse_qs(binary_to_list(Binary), []);
parse_qs(String) ->
    parse_qs(String, []).

%% Take one key and one value per iteration until the input runs out.
%% Pairs is built in reverse.
parse_qs([], Pairs) ->
    lists:reverse(Pairs);
parse_qs(Input, Pairs) ->
    {Key, AfterKey} = parse_qs_key(Input),
    {Value, AfterValue} = parse_qs_value(AfterKey),
    parse_qs(AfterValue, [{Key, Value} | Pairs]).
180
%% Read one key from the front of a query string and return
%% {DecodedKey, Remaining}. A "=" ends the key and is consumed. A ";" or
%% "&" also ends the key but stays in Remaining, so that
%% parse_qs_value/1 sees it and yields an empty value.
parse_qs_key(String) ->
    parse_qs_key(String, []).

parse_qs_key([], RevKey) ->
    {qs_revdecode(RevKey), ""};
parse_qs_key([$= | AfterEq], RevKey) ->
    {qs_revdecode(RevKey), AfterEq};
parse_qs_key([Sep | _] = Remaining, RevKey) when Sep =:= $; orelse Sep =:= $& ->
    {qs_revdecode(RevKey), Remaining};
parse_qs_key([Ch | Tail], RevKey) ->
    parse_qs_key(Tail, [Ch | RevKey]).
194
%% Read one value from the front of a query string and return
%% {DecodedValue, Remaining}. The value runs up to the next ";" or "&",
%% which is consumed, or to the end of the input.
parse_qs_value(String) ->
    parse_qs_value(String, []).

parse_qs_value([], RevValue) ->
    {qs_revdecode(RevValue), ""};
parse_qs_value([Sep | Remaining], RevValue) when Sep =:= $; orelse Sep =:= $& ->
    {qs_revdecode(RevValue), Remaining};
parse_qs_value([Ch | Tail], RevValue) ->
    parse_qs_value(Tail, [Ch | RevValue]).
206
%% @spec unquote(string() | binary()) -> string()
%% @doc Unquote a URL encoded string: "+" becomes a space and each "%XX"
%%      escape becomes the byte it names.
unquote(Binary) when is_binary(Binary) ->
    unquote(binary_to_list(Binary));
unquote(String) ->
    %% the decoder consumes its input back to front
    Reversed = lists:reverse(String),
    qs_revdecode(Reversed).
213
%% Decode a REVERSED url-encoded string and return it in forward order.
%% Because the input is reversed, an escape "%HL" arrives as L, H, "%".
qs_revdecode(S) ->
    qs_revdecode(S, []).

qs_revdecode([], Decoded) ->
    Decoded;
qs_revdecode([$+ | Tail], Decoded) ->
    qs_revdecode(Tail, [$\s | Decoded]);
qs_revdecode([Lo, Hi, ?PERCENT | Tail], Decoded) when ?IS_HEX(Lo), ?IS_HEX(Hi) ->
    Byte = (unhexdigit(Hi) bsl 4) bor unhexdigit(Lo),
    qs_revdecode(Tail, [Byte | Decoded]);
qs_revdecode([Ch | Tail], Decoded) ->
    %% ordinary character, or a "%" not followed by two hex digits
    qs_revdecode(Tail, [Ch | Decoded]).
225
%% @spec urlsplit(Url) -> {Scheme, Netloc, Path, Query, Fragment}
%% @doc Break a URL into a 5-tuple of strings. Percent escapes are left
%%      as they are. Only HTTP style URLs are supported.
urlsplit(Url) ->
    {Scheme, AfterScheme} = urlsplit_scheme(Url),
    {Netloc, AfterNetloc} = urlsplit_netloc(AfterScheme),
    {Path, Query, Fragment} = urlsplit_path(AfterNetloc),
    {Scheme, Netloc, Path, Query, Fragment}.
234
%% Split a leading "scheme:" off a URL and return {Scheme, Rest}, with the
%% scheme lowercased and the colon consumed. When the URL has no valid
%% scheme the result is {"", Url}.
%%
%% A scheme is a non-empty run of letters, digits, "+", "-" and "."
%% directly followed by ":". Earlier, everything before the first ":"
%% anywhere in the URL was taken as the scheme, so a relative URL such as
%% "/foo?u=http://x" was reported with the scheme "/foo?u=http".
urlsplit_scheme(Url) ->
    case urlsplit_scheme(Url, []) of
        no_scheme ->
            {"", Url};
        Result ->
            Result
    end.

urlsplit_scheme([C | Rest], Acc) when ((C >= $a andalso C =< $z) orelse
                                       (C >= $A andalso C =< $Z) orelse
                                       (C >= $0 andalso C =< $9) orelse
                                       C =:= $+ orelse C =:= $- orelse
                                       C =:= $.) ->
    urlsplit_scheme(Rest, [C | Acc]);
urlsplit_scheme([$: | Rest], Acc=[_ | _]) ->
    {string:to_lower(lists:reverse(Acc)), Rest};
urlsplit_scheme(_Rest, _Acc) ->
    %% end of input, a leading ":", or a character not allowed in a scheme
    no_scheme.
244
%% Split the network location off a URL whose scheme has already been
%% removed and return {Netloc, Rest}. A netloc is present only when the
%% input starts with "//". It runs up to, but not including, the first
%% "/", "?" or "#", or to the end of the input.
urlsplit_netloc("//" ++ Rest) ->
    urlsplit_netloc(Rest, []);
urlsplit_netloc(Path) ->
    {"", Path}.

urlsplit_netloc("", Acc) ->
    %% URL ends right after the host, e.g. "http://host". Without this
    %% clause such input crashed with function_clause.
    {lists:reverse(Acc), ""};
urlsplit_netloc(Rest=[C | _], Acc) when C =:= $/; C =:= $?; C =:= $# ->
    {lists:reverse(Acc), Rest};
urlsplit_netloc([C | Rest], Acc) ->
    urlsplit_netloc(Rest, [C | Acc]).
254
255
%% @spec path_split(string()) -> {Part, Rest}
%% @doc Split off the first component of a path, as in URL traversal. The
%%      separating "/" is dropped and Rest is "" when there is none.
%%      path_split("foo/bar") = {"foo", "bar"},
%%      path_split("/foo/bar") = {"", "foo/bar"}.
path_split(S) ->
    case lists:splitwith(fun (C) -> C =/= $/ end, S) of
        {Part, [$/ | Rest]} ->
            {Part, Rest};
        {Part, []} ->
            {Part, ""}
    end.
269
270
%% @spec urlunsplit({Scheme, Netloc, Path, Query, Fragment}) -> string()
%% @doc Assemble a URL from the 5-tuple. Path must be absolute. "://" is
%%      written after the scheme only when the scheme is not empty.
urlunsplit({Scheme, Netloc, Path, Query, Fragment}) ->
    Prefix = case Scheme of
                 "" -> "";
                 _ -> [Scheme, "://"]
             end,
    PathPart = urlunsplit_path({Path, Query, Fragment}),
    lists:flatten([Prefix, Netloc, PathPart]).
277
%% @spec urlunsplit_path({Path, Query, Fragment}) -> string()
%% @doc Assemble a URL path from the 3-tuple. A non-empty query is
%%      prefixed with "?" and a non-empty fragment with "#"; empty ones
%%      are left out entirely.
urlunsplit_path({Path, Query, Fragment}) ->
    WithMarker = fun (_Marker, "") -> "";
                     (Marker, Part) -> [Marker | Part]
                 end,
    lists:flatten([Path, WithMarker($?, Query), WithMarker($#, Fragment)]).
284
%% @spec urlsplit_path(Url) -> {Path, Query, Fragment}
%% @doc Break a URL path into a 3-tuple of strings. Percent escapes are
%%      left as they are. Only HTTP style paths are supported.
urlsplit_path(Path) ->
    urlsplit_path(Path, []).

%% RevPath collects the path characters in reverse until the first "?"
%% or "#", or the end of the input.
urlsplit_path([], RevPath) ->
    {lists:reverse(RevPath), "", ""};
urlsplit_path([$? | AfterMark], RevPath) ->
    {Query, Fragment} = urlsplit_query(AfterMark),
    {lists:reverse(RevPath), Query, Fragment};
urlsplit_path([$# | Fragment], RevPath) ->
    %% a "?" after the "#" belongs to the fragment, so there is no query
    {lists:reverse(RevPath), "", Fragment};
urlsplit_path([Ch | Tail], RevPath) ->
    urlsplit_path(Tail, [Ch | RevPath]).
300
%% Split what follows a "?" into {Query, Fragment} at the first "#".
%% The "#" itself is dropped; Fragment is "" when there is none.
urlsplit_query(Query) ->
    case lists:splitwith(fun (C) -> C =/= $# end, Query) of
        {QueryPart, [$# | Fragment]} ->
            {QueryPart, Fragment};
        {QueryPart, []} ->
            {QueryPart, ""}
    end.
310
%% @spec guess_mime(string()) -> string()
%% @doc  Guess the mime type of a file by the extension of its filename.
%%       The extension is compared case-insensitively, so "x.JPG" and
%%       "x.jpg" give the same answer. Unknown or missing extensions
%%       yield "text/plain".
guess_mime(File) ->
    %% Only lowercase string extensions: filename:extension/1 returns a
    %% binary for binary input, which string:to_lower/1 does not accept.
    %% Such values fall through to the default as before.
    Ext = case filename:extension(File) of
              E when is_list(E) -> string:to_lower(E);
              NonList -> NonList
          end,
    case Ext of
        ".html" ->
            "text/html";
        ".htm" ->
            "text/html";
        ".xhtml" ->
            "application/xhtml+xml";
        ".xml" ->
            "application/xml";
        ".css" ->
            "text/css";
        ".js" ->
            "application/x-javascript";
        ".jpg" ->
            "image/jpeg";
        ".jpeg" ->
            "image/jpeg";
        ".gif" ->
            "image/gif";
        ".png" ->
            "image/png";
        ".swf" ->
            "application/x-shockwave-flash";
        ".zip" ->
            "application/zip";
        ".bz2" ->
            "application/x-bzip2";
        ".gz" ->
            "application/x-gzip";
        ".tar" ->
            "application/x-tar";
        ".tgz" ->
            "application/x-gzip";
        ".txt" ->
            "text/plain";
        ".doc" ->
            "application/msword";
        ".pdf" ->
            "application/pdf";
        ".xls" ->
            "application/vnd.ms-excel";
        ".rtf" ->
            "application/rtf";
        ".mov" ->
            "video/quicktime";
        ".mp3" ->
            "audio/mpeg";
        ".z" ->
            "application/x-compress";
        ".wav" ->
            "audio/x-wav";
        ".ico" ->
            "image/x-icon";
        ".bmp" ->
            "image/bmp";
        ".m4a" ->
            "audio/mpeg";
        ".m3u" ->
            "audio/x-mpegurl";
        ".exe" ->
            "application/octet-stream";
        ".csv" ->
            "text/csv";
        _ ->
            "text/plain"
    end.
376
%% @spec parse_header(string()) -> {Type, [{K, V}]}
%% @doc  Parse a Content-Type like header, return the main Content-Type
%%       (lowercased) and a property list of options. Option names are
%%       lowercased; option values have surrounding double quotes removed.
%%       Parts without a name or without an "=" are skipped. A header that
%%       is empty or holds nothing but ";" separators yields {"", []}
%%       rather than crashing with badmatch.
parse_header(String) ->
    %% TODO: This is exactly as broken as Python's cgi module.
    %%       Should parse properly like mochiweb_cookies.
    F = fun (S, Acc) ->
                case lists:splitwith(fun (C) -> C =/= $= end, S) of
                    {"", _} ->
                        %% Skip anything with no name
                        Acc;
                    {_, ""} ->
                        %% Skip anything with no "=" at all
                        Acc;
                    {Name, [$= | Value]} ->
                        [{string:to_lower(string:strip(Name)),
                          unquote_header(string:strip(Value))} | Acc]
                end
        end,
    case [string:strip(S) || S <- string:tokens(String, ";")] of
        [] ->
            %% string:tokens/2 returns no tokens for "" or ";;;"
            {"", []};
        [Type | Parts] ->
            {string:to_lower(Type),
             lists:foldr(F, [], Parts)}
    end.

%% Remove the surrounding double quotes from a header option value.
%% A value that does not start with a double quote is returned unchanged.
unquote_header("\"" ++ Rest) ->
    unquote_header(Rest, []);
unquote_header(S) ->
    S.

%% Body of a quoted value: a backslash makes the following character
%% literal, and a double quote that ends the input is dropped. A double
%% quote anywhere else is kept as-is.
unquote_header("", Acc) ->
    lists:reverse(Acc);
unquote_header("\"", Acc) ->
    lists:reverse(Acc);
unquote_header([$\\, C | Rest], Acc) ->
    unquote_header(Rest, [C | Acc]);
unquote_header([C | Rest], Acc) ->
    unquote_header(Rest, [C | Acc]).
413
%% @spec record_to_proplist(Record, Fields) -> proplist()
%% @doc Same as record_to_proplist/3 with the default TypeKey '__record'.
record_to_proplist(Record, Fields) ->
    record_to_proplist(Record, Fields, '__record').

%% @spec record_to_proplist(Record, Fields, TypeKey) -> proplist()
%% @doc Turn a record into a proplist. The first entry maps TypeKey to
%%      the record type (the first element of the tuple); each remaining
%%      entry maps a name from Fields to the value at the same position in
%%      Record. Fields should come from record_info(fields, record_type)
%%      for the record type of Record. The call fails with function_clause
%%      when the number of fields does not match the record size.
record_to_proplist(Record, Fields, TypeKey)
  when tuple_size(Record) - 1 =:= length(Fields) ->
    [RecordName | Values] = tuple_to_list(Record),
    [{TypeKey, RecordName} | lists:zip(Fields, Values)].
428
429
%% Accumulator loop behind shell_quote/1. Quoted holds the output in
%% reverse and already starts with the opening double quote. The four
%% characters that stay special inside double quotes each get a backslash
%% in front; the closing double quote is appended at the end.
shell_quote([], Quoted) ->
    lists:reverse(Quoted, "\"");
shell_quote([Ch | Tail], Quoted) ->
    case lists:member(Ch, "\"`\\$") of
        true ->
            shell_quote(Tail, [Ch, $\\ | Quoted]);
        false ->
            shell_quote(Tail, [Ch | Quoted])
    end.
437
%% Run every self-test of this module in a fixed order. Returns ok; a
%% failing test raises from inside the test function.
test() ->
    Suites = [fun test_join/0,
              fun test_quote_plus/0,
              fun test_unquote/0,
              fun test_urlencode/0,
              fun test_parse_qs/0,
              fun test_urlsplit_path/0,
              fun test_urlunsplit_path/0,
              fun test_urlsplit/0,
              fun test_urlunsplit/0,
              fun test_path_split/0,
              fun test_guess_mime/0,
              fun test_parse_header/0,
              fun test_shell_quote/0,
              fun test_cmd/0,
              fun test_cmd_string/0,
              fun test_partition/0,
              fun test_safe_relative_path/0],
    lists:foreach(fun (Suite) -> Suite() end, Suites),
    ok.
457
%% shell_quote/1 must escape "$", the double quote and the backquote, and
%% leave the single quote alone.
test_shell_quote() ->
    Expected = "\"foo \\$bar\\\"\\`' baz\"",
    Expected = shell_quote("foo $bar\"`' baz"),
    ok.
461
%% cmd/1 runs echo through the shell; the quoting must stop the shell
%% from expanding "$" and backquotes in the argument.
test_cmd() ->
    Output = cmd(["echo", "$bling$ `word`!"]),
    "$bling$ `word`!\n" = Output,
    ok.
465
%% cmd_string/1 quotes each argument and joins them with a space.
test_cmd_string() ->
    CommandLine = cmd_string(["echo", "$bling$ `word`!"]),
    "\"echo\" \"\\$bling\\$ \\`word\\`!\"" = CommandLine,
    ok.
469
%% parse_header/1 splits the main type from its options.
test_parse_header() ->
    Parsed = parse_header("multipart/form-data; boundary=AaB03x"),
    {"multipart/form-data", [{"boundary", "AaB03x"}]} = Parsed,
    ok.
474
%% Each entry is {ExpectedMimeType, Filename}.
test_guess_mime() ->
    Cases = [{"text/plain", ""},
             {"text/plain", ".text"},
             {"application/zip", ".zip"},
             {"application/zip", "x.zip"},
             {"text/html", "x.html"},
             {"application/xhtml+xml", "x.xhtml"}],
    lists:foreach(fun ({Mime, File}) -> Mime = guess_mime(File) end, Cases),
    ok.
483
%% Each entry is {ExpectedResult, Path}.
test_path_split() ->
    Cases = [{{"", "foo/bar"}, "/foo/bar"},
             {{"foo", "bar"}, "foo/bar"},
             {{"bar", ""}, "bar"}],
    lists:foreach(fun ({Split, Path}) -> Split = path_split(Path) end, Cases),
    ok.
489
%% Each entry is {Expected5Tuple, Url}. A "?" after "#" stays part of the
%% fragment.
test_urlsplit() ->
    Cases = [{{"", "", "/foo", "", "bar?baz"}, "/foo#bar?baz"},
             {{"http", "host:port", "/foo", "", "bar?baz"},
              "http://host:port/foo#bar?baz"}],
    lists:foreach(fun ({Parts, Url}) -> Parts = urlsplit(Url) end, Cases),
    ok.
495
%% Each entry is {Expected3Tuple, Path}.
test_urlsplit_path() ->
    Cases = [{{"/foo/bar", "", ""}, "/foo/bar"},
             {{"/foo", "baz", ""}, "/foo?baz"},
             {{"/foo", "", "bar?baz"}, "/foo#bar?baz"},
             {{"/foo", "", "bar?baz#wibble"}, "/foo#bar?baz#wibble"},
             {{"/foo", "bar", "baz"}, "/foo?bar#baz"},
             {{"/foo", "bar?baz", "baz"}, "/foo?bar?baz#baz"}],
    lists:foreach(fun ({Parts, Path}) -> Parts = urlsplit_path(Path) end,
                  Cases),
    ok.
504
%% Each entry is {ExpectedUrl, 5Tuple}.
test_urlunsplit() ->
    Cases = [{"/foo#bar?baz", {"", "", "/foo", "", "bar?baz"}},
             {"http://host:port/foo#bar?baz",
              {"http", "host:port", "/foo", "", "bar?baz"}}],
    lists:foreach(fun ({Url, Parts}) -> Url = urlunsplit(Parts) end, Cases),
    ok.
510
%% Each entry is {ExpectedPath, 3Tuple}.
test_urlunsplit_path() ->
    Cases = [{"/foo/bar", {"/foo/bar", "", ""}},
             {"/foo?baz", {"/foo", "baz", ""}},
             {"/foo#bar?baz", {"/foo", "", "bar?baz"}},
             {"/foo#bar?baz#wibble", {"/foo", "", "bar?baz#wibble"}},
             {"/foo?bar#baz", {"/foo", "bar", "baz"}},
             {"/foo?bar?baz#baz", {"/foo", "bar?baz", "baz"}}],
    lists:foreach(fun ({Path, Parts}) -> Path = urlunsplit_path(Parts) end,
                  Cases),
    ok.
519
%% Each entry is {ExpectedString, Strings, Separator}. The separator may
%% be a character or a string, and elements may be deep lists.
test_join() ->
    Cases = [{"foo,bar,baz", ["foo", "bar", "baz"], $,},
             {"foo,bar,baz", ["foo", "bar", "baz"], ","},
             {"foo bar", [["foo", " bar"]], ","},
             {"foo bar,baz", [["foo", " bar"], "baz"], ","},
             {"foo", ["foo"], ","},
             {"foobarbaz", ["foo", "bar", "baz"], ""}],
    lists:foreach(fun ({Joined, Strings, Sep}) ->
                          Joined = join(Strings, Sep)
                  end, Cases),
    ok.
528
%% quote_plus/1 over each accepted input type. The "foo\n" case used to be
%% asserted twice verbatim; the second copy now passes the same text as a
%% binary so that the is_binary clause is covered.
test_quote_plus() ->
    "foo" = quote_plus(foo),
    "1" = quote_plus(1),
    "1.1" = quote_plus(1.1),
    "foo" = quote_plus("foo"),
    "foo+bar" = quote_plus("foo bar"),
    "foo%0A" = quote_plus("foo\n"),
    "foo%0A" = quote_plus(<<"foo\n">>),
    "foo%3B%26%3D" = quote_plus("foo;&="),
    ok.
539
%% Each entry is {ExpectedString, QuotedInput}.
test_unquote() ->
    Cases = [{"foo bar", "foo+bar"},
             {"foo bar", "foo%20bar"},
             {"foo\r\n", "foo%0D%0A"}],
    lists:foreach(fun ({Plain, Quoted}) -> Plain = unquote(Quoted) end,
                  Cases),
    ok.
545
%% urlencode/1 accepts atom, string and integer keys and values.
test_urlencode() ->
    Props = [{foo, "bar"},
             {"baz", "wibble \r\n"},
             {z, 1}],
    "foo=bar&baz=wibble+%0D%0A&z=1" = urlencode(Props),
    ok.
551
%% parse_qs/1 decodes "+" and percent escapes and keeps the pair order.
test_parse_qs() ->
    Parsed = parse_qs("foo=bar&baz=wibble+%0D%0A&z=1"),
    [{"foo", "bar"}, {"baz", "wibble \r\n"}, {"z", "1"}] = Parsed,
    ok.
556
%% Each entry is {Expected3Tuple, String, Separator}.
test_partition() ->
    Cases = [{{"foo", "", ""}, "foo", "/"},
             {{"foo", "/", "bar"}, "foo/bar", "/"},
             {{"foo", "/", ""}, "foo/", "/"},
             {{"", "/", "bar"}, "/bar", "/"},
             {{"f", "oo/ba", "r"}, "foo/bar", "oo/ba"}],
    lists:foreach(fun ({Parts, String, Sep}) ->
                          Parts = partition(String, Sep)
                  end, Cases),
    ok.
564
%% Each entry is {ExpectedResult, Path}. undefined marks a path that is
%% rejected as unsafe.
test_safe_relative_path() ->
    Cases = [{"foo", "foo"},
             {"foo/", "foo/"},
             {"foo", "foo/bar/.."},
             {"bar", "foo/../bar"},
             {"bar/", "foo/../bar/"},
             {"", "foo/.."},
             {"", "foo/../"},
             {undefined, "/foo"},
             {undefined, "../foo"},
             {undefined, "foo/../.."},
             {undefined, "foo//"}],
    lists:foreach(fun ({Reduced, Path}) ->
                          Reduced = safe_relative_path(Path)
                  end, Cases),
    ok.
578