1%% @author Bob Ippolito <bob@mochimedia.com>
2%% @copyright 2007 Mochi Media, Inc.
3
4%% @doc Utilities for parsing and quoting.
5
6-module(mochiweb_util).
7-author('bob@mochimedia.com').
8-export([join/2, quote_plus/1, urlencode/1, parse_qs/1, unquote/1]).
9-export([path_split/1]).
10-export([urlsplit/1, urlsplit_path/1, urlunsplit/1, urlunsplit_path/1]).
11-export([guess_mime/1, parse_header/1]).
12-export([shell_quote/1, cmd/1, cmd_string/1, cmd_port/2]).
13-export([record_to_proplist/2, record_to_proplist/3]).
14-export([to_lower/1]).
15-export([test/0]).
16
%% Character codes kept as plain integers.
-define(PERCENT, 37).  % the "%" character
-define(FULLSTOP, 46). % the "." character

%% True when C is the code of an ASCII hexadecimal digit (either case).
%% Built only from comparisons so it can be used inside guards.
-define(IS_HEX(C), (($0 =< C andalso C =< $9) orelse
                    ($A =< C andalso C =< $F) orelse
                    ($a =< C andalso C =< $f))).

%% True when C may appear unescaped in a query string: ASCII letters,
%% digits and the four punctuation characters "-", ".", "_" and "~".
-define(QS_SAFE(C), (($0 =< C andalso C =< $9) orelse
                     ($A =< C andalso C =< $Z) orelse
                     ($a =< C andalso C =< $z) orelse
                     C =:= $- orelse C =:= ?FULLSTOP orelse
                     C =:= $_ orelse C =:= $~)).
27
%% Map a nibble value (0..15) to its uppercase hexadecimal digit character:
%% values below 10 map onto $0..$9, values below 16 onto $A..$F.
hexdigit(Nibble) when Nibble < 10 -> Nibble + $0;
hexdigit(Nibble) when Nibble < 16 -> Nibble - 10 + $A.
30
%% Inverse of hexdigit/1: turn a hexadecimal digit character (either case)
%% into its numeric value 0..15. Any other input raises function_clause.
unhexdigit(Digit) when $0 =< Digit, Digit =< $9 -> Digit - $0;
unhexdigit(Digit) when $A =< Digit, Digit =< $F -> 10 + (Digit - $A);
unhexdigit(Digit) when $a =< Digit, Digit =< $f -> 10 + (Digit - $a).
34
%% @spec shell_quote(string()) -> string()
%% @doc Wrap a string in double quotes for use in a UNIX shell command line.
%%      The actual escaping is done by shell_quote/2, which is seeded here
%%      with the opening double quote.
shell_quote(L) ->
    OpeningQuote = "\"",
    shell_quote(L, OpeningQuote).
40
%% @spec cmd_port([string()], Options) -> port()
%% @doc Open a port running the command described by Argv. The argument
%%      list is turned into a single shell quoted string with cmd_string/1
%%      and handed to open_port/2 as {spawn, Command}; Options is passed
%%      through unchanged.
cmd_port(Argv, Options) ->
    Command = cmd_string(Argv),
    open_port({spawn, Command}, Options).
45
%% @spec cmd([string()]) -> string()
%% @doc Run the command described by Argv through os:cmd/1 and return its
%%      output. The argument list is shell quoted with cmd_string/1 first.
cmd(Argv) ->
    Command = cmd_string(Argv),
    os:cmd(Command).
50
%% @spec cmd_string([string()]) -> string()
%% @doc Build one command line from a list of arguments: every argument is
%%      quoted with shell_quote/1 and the results are joined with single
%%      spaces.
cmd_string(Argv) ->
    Quoted = [shell_quote(Arg) || Arg <- Argv],
    join(Quoted, " ").
55
%% @spec join([string()], Separator) -> string()
%% @doc Concatenate a list of (possibly deep) strings, placing Separator
%%      between consecutive elements. Separator may be a single character
%%      or a string. The result is always a flat string; joining an empty
%%      list yields the empty list.
join([], _Separator) ->
    [];
join([Only], _Separator) ->
    lists:flatten(Only);
join([First | Others], Separator) ->
    %% Prefix every element after the first with the separator, then flatten.
    lists:flatten([First | [[Separator, Item] || Item <- Others]]).

%% Interleave Separator between the elements of an already reversed list,
%% consing onto Acc so the elements come out in forward order. The result
%% is a deep list that callers flatten. No separator is emitted for an
%% element consed onto an empty accumulator.
revjoin(Reversed, Separator, Acc) ->
    lists:foldl(fun (Item, []) -> [Item];
                    (Item, Joined) -> [Item, Separator | Joined]
                end, Acc, Reversed).
72
%% @spec quote_plus(atom() | integer() | binary() | string()) -> string()
%% @doc URL safe encoding of the given term. Atoms and integers are first
%%      converted to their textual form and binaries to a list of bytes;
%%      the resulting string is then percent-encoded by quote_plus/2.
%%      Binaries are accepted for symmetry with unquote/1 and parse_qs/1,
%%      which also accept them.
quote_plus(Atom) when is_atom(Atom) ->
    quote_plus(atom_to_list(Atom));
quote_plus(Int) when is_integer(Int) ->
    quote_plus(integer_to_list(Int));
quote_plus(Binary) when is_binary(Binary) ->
    quote_plus(binary_to_list(Binary));
quote_plus(String) ->
    quote_plus(String, []).
81
%% Worker for quote_plus/1. Walks the input once, building the output in
%% reverse: a space becomes "+", characters accepted by ?QS_SAFE are copied
%% as they are, and everything else becomes "%" followed by two uppercase
%% hexadecimal digits taken from the low eight bits of the character.
quote_plus([], RevOut) ->
    lists:reverse(RevOut);
quote_plus([$\s | Tail], RevOut) ->
    quote_plus(Tail, [$+ | RevOut]);
quote_plus([Ch | Tail], RevOut) when ?QS_SAFE(Ch) ->
    quote_plus(Tail, [Ch | RevOut]);
quote_plus([Ch | Tail], RevOut) ->
    <<High:4, Low:4>> = <<Ch>>,
    Escape = [?PERCENT, hexdigit(High), hexdigit(Low)],
    %% reverse/2 pushes the escape onto the reversed output back to front.
    quote_plus(Tail, lists:reverse(Escape, RevOut)).
91
%% @spec urlencode([{Key, Value}]) -> string()
%% @doc URL encode a property list as "k1=v1&amp;k2=v2...". Keys and values
%%      are each encoded with quote_plus/1 and pairs keep their input order.
%%      An element that is not a 2-tuple raises function_clause.
urlencode(Props) ->
    EncodePair = fun ({K, V}) -> [quote_plus(K), $=, quote_plus(V)] end,
    Pairs = lists:map(EncodePair, Props),
    lists:flatten(revjoin(lists:reverse(Pairs), $&, [])).
99
%% @spec parse_qs(string() | binary()) -> [{Key, Value}]
%% @doc Decode a query string (or an application/x-www-form-urlencoded
%%      body) into a list of {Key, Value} string pairs, in order of
%%      appearance. A binary is converted with binary_to_list/1 first.
parse_qs(Input) ->
    case is_binary(Input) of
        true -> parse_qs(binary_to_list(Input), []);
        false -> parse_qs(Input, [])
    end.

%% Consume one key and one value per iteration until the input is empty.
parse_qs([], Pairs) ->
    lists:reverse(Pairs);
parse_qs(Remaining, Pairs) ->
    {Key, AfterKey} = parse_qs_key(Remaining),
    {Value, AfterValue} = parse_qs_value(AfterKey),
    parse_qs(AfterValue, [{Key, Value} | Pairs]).
113
%% Read a key from the front of a query string. Returns {Key, Rest} where
%% Key is already decoded. A "=" ends the key and is consumed; a ";" or "&"
%% also ends the key but is left in Rest so that parse_qs_value/1 sees it
%% and produces an empty value.
parse_qs_key(String) ->
    parse_qs_key(String, "").

parse_qs_key([], RevKey) ->
    {qs_revdecode(RevKey), ""};
parse_qs_key([$= | Tail], RevKey) ->
    {qs_revdecode(RevKey), Tail};
parse_qs_key([Sep | _] = Input, RevKey) when Sep =:= $; orelse Sep =:= $& ->
    {qs_revdecode(RevKey), Input};
parse_qs_key([Ch | Tail], RevKey) ->
    parse_qs_key(Tail, [Ch | RevKey]).
127
%% Read a value from the front of a query string. Returns {Value, Rest}
%% where Value is already decoded. The value runs up to the next ";" or "&"
%% (which is consumed) or to the end of the input.
parse_qs_value(String) ->
    parse_qs_value(String, "").

parse_qs_value([], RevValue) ->
    {qs_revdecode(RevValue), ""};
parse_qs_value([Sep | Tail], RevValue) when Sep =:= $; orelse Sep =:= $& ->
    {qs_revdecode(RevValue), Tail};
parse_qs_value([Ch | Tail], RevValue) ->
    parse_qs_value(Tail, [Ch | RevValue]).
139
%% @spec unquote(string() | binary()) -> string()
%% @doc Decode a URL encoded string: "+" becomes a space and "%XX" becomes
%%      the character with that hexadecimal code. A binary is converted
%%      with binary_to_list/1 first.
unquote(Input) ->
    Chars = case is_binary(Input) of
                true -> binary_to_list(Input);
                false -> Input
            end,
    %% The decoder works on reversed input, so reverse before handing over.
    qs_revdecode(lists:reverse(Chars)).
146
%% Decode a REVERSED URL encoded string and return it in forward order.
%% Because the input is reversed, an escape "%HL" shows up as L, H, "%".
%% A "+" decodes to a space; a "%" that is not followed by two hexadecimal
%% digits in the forward text is copied through untouched.
qs_revdecode(Reversed) ->
    qs_revdecode(Reversed, "").

qs_revdecode([], Decoded) ->
    Decoded;
qs_revdecode([$+ | Tail], Decoded) ->
    qs_revdecode(Tail, [$\s | Decoded]);
qs_revdecode([Lo, Hi, ?PERCENT | Tail], Decoded) when ?IS_HEX(Lo), ?IS_HEX(Hi) ->
    Byte = (unhexdigit(Hi) bsl 4) bor unhexdigit(Lo),
    qs_revdecode(Tail, [Byte | Decoded]);
qs_revdecode([Ch | Tail], Decoded) ->
    qs_revdecode(Tail, [Ch | Decoded]).
158
%% @spec urlsplit(Url) -> {Scheme, Netloc, Path, Query, Fragment}
%% @doc Break a URL into a 5-tuple of strings. The scheme is split off
%%      first, then the network location, and the remainder is divided by
%%      urlsplit_path/1. Percent escapes are left as they are. Only HTTP
%%      style URLs are supported.
urlsplit(Url) ->
    {Scheme, AfterScheme} = urlsplit_scheme(Url),
    {Netloc, AfterNetloc} = urlsplit_netloc(AfterScheme),
    {Path, Query, Fragment} = urlsplit_path(AfterNetloc),
    {Scheme, Netloc, Path, Query, Fragment}.
167
%% Split the scheme off the front of Url. Returns {Scheme, Rest} where
%% Scheme is lowercased and Rest is what follows the ":".
%%
%% Only characters allowed in a URI scheme (ASCII letters, digits, "+", "-"
%% and ".") are accepted before the ":", and the scheme must not be empty.
%% As soon as any other character is seen, or the input ends, the URL is
%% treated as having no scheme and is returned whole as Rest. Without this
%% restriction a ":" inside the path or query (for example
%% "/go?to=http://x") would be mistaken for the scheme delimiter.
urlsplit_scheme(Url) ->
    urlsplit_scheme(Url, []).

urlsplit_scheme([C | Rest], Acc) when (C >= $a andalso C =< $z) orelse
                                      (C >= $A andalso C =< $Z) orelse
                                      (C >= $0 andalso C =< $9) orelse
                                      C =:= $+ orelse C =:= $- orelse
                                      C =:= $. ->
    urlsplit_scheme(Rest, [C | Acc]);
urlsplit_scheme([$: | Rest], Acc = [_ | _]) ->
    {to_lower(lists:reverse(Acc)), Rest};
urlsplit_scheme(Rest, Acc) ->
    %% Not a scheme after all: put the consumed characters back.
    {"", lists:reverse(Acc, Rest)}.
177
%% Split the network location off a URL whose scheme has already been
%% removed. Returns {Netloc, Rest}. A network location is only present when
%% the input starts with "//"; it then runs up to the first "/", "?" or "#"
%% (left in Rest) or to the end of the input. Without the "//" prefix the
%% network location is empty and the input is returned untouched.
urlsplit_netloc("//" ++ Rest) ->
    urlsplit_netloc(Rest, []);
urlsplit_netloc(Path) ->
    {"", Path}.

%% End of input: the URL has a host but no path, e.g. "http://host".
urlsplit_netloc("", Acc) ->
    {lists:reverse(Acc), ""};
urlsplit_netloc(Rest = [C | _], Acc) when C =:= $/ orelse C =:= $? orelse
                                          C =:= $# ->
    {lists:reverse(Acc), Rest};
urlsplit_netloc([C | Rest], Acc) ->
    urlsplit_netloc(Rest, [C | Acc]).
187
188
%% @spec path_split(string()) -> {Part, Rest}
%% @doc Split a path at its first "/", as in URL traversal. Part is
%%      everything before that slash and Rest everything after it; the
%%      slash itself is dropped. Without a slash the whole input is Part
%%      and Rest is empty.
%%      path_split("foo/bar") = {"foo", "bar"},
%%      path_split("/foo/bar") = {"", "foo/bar"}.
path_split(S) ->
    case lists:splitwith(fun (Ch) -> Ch =/= $/ end, S) of
        {Part, [$/ | Rest]} -> {Part, Rest};
        {Part, []} -> {Part, ""}
    end.
202
203
%% @spec urlunsplit({Scheme, Netloc, Path, Query, Fragment}) -> string()
%% @doc Assemble a URL from the 5-tuple. Path must be absolute. A non-empty
%%      Scheme is followed by "://"; an empty Scheme contributes nothing.
%%      Netloc is inserted as given and the remainder is built by
%%      urlunsplit_path/1.
urlunsplit({Scheme, Netloc, Path, Query, Fragment}) ->
    SchemePart = case Scheme of
                     "" -> "";
                     _ -> [Scheme, "://"]
                 end,
    lists:flatten([SchemePart,
                   Netloc,
                   urlunsplit_path({Path, Query, Fragment})]).

%% @spec urlunsplit_path({Path, Query, Fragment}) -> string()
%% @doc Assemble a URL path from the 3-tuple. A non-empty Query is prefixed
%%      with "?" and a non-empty Fragment with "#"; empty ones are omitted.
urlunsplit_path({Path, Query, Fragment}) ->
    WithMark = fun (_Mark, "") -> "";
                   (Mark, Text) -> [Mark | Text]
               end,
    lists:flatten([Path, WithMark($?, Query), WithMark($#, Fragment)]).
217
%% @spec urlsplit_path(Url) -> {Path, Query, Fragment}
%% @doc Divide a URL path into path, query and fragment strings. The path
%%      ends at the first "?" or "#". After a "?", the query runs up to the
%%      next "#". Once a "#" has been seen everything that follows is the
%%      fragment, including any further "?" or "#". Percent escapes are
%%      left as they are. Only HTTP style paths are supported.
urlsplit_path(Path) ->
    urlsplit_path(Path, []).

urlsplit_path([], RevPath) ->
    {lists:reverse(RevPath), "", ""};
urlsplit_path([$? | Tail], RevPath) ->
    {Query, Fragment} = urlsplit_query(Tail),
    {lists:reverse(RevPath), Query, Fragment};
urlsplit_path([$# | Fragment], RevPath) ->
    {lists:reverse(RevPath), "", Fragment};
urlsplit_path([Ch | Tail], RevPath) ->
    urlsplit_path(Tail, [Ch | RevPath]).

%% Split what follows the "?" into {Query, Fragment} at the first "#".
%% The "#" is dropped; without one the fragment is empty.
urlsplit_query(Query) ->
    case lists:splitwith(fun (Ch) -> Ch =/= $# end, Query) of
        {Q, [$# | Fragment]} -> {Q, Fragment};
        {Q, []} -> {Q, ""}
    end.
243
%% @spec guess_mime(string()) -> string()
%% @doc  Guess the mime type of a file from the extension of its filename,
%%       as reported by filename:extension/1. The comparison is exact
%%       (case sensitive). Unknown or missing extensions yield "text/plain".
guess_mime(File) ->
    KnownTypes =
        [{".html", "text/html"},
         {".xhtml", "application/xhtml+xml"},
         {".xml", "application/xml"},
         {".css", "text/css"},
         {".js", "application/x-javascript"},
         {".jpg", "image/jpeg"},
         {".gif", "image/gif"},
         {".png", "image/png"},
         {".swf", "application/x-shockwave-flash"},
         {".zip", "application/zip"},
         {".bz2", "application/x-bzip2"},
         {".gz", "application/x-gzip"},
         {".tar", "application/x-tar"},
         {".tgz", "application/x-gzip"},
         {".txt", "text/plain"},
         {".doc", "application/msword"},
         {".pdf", "application/pdf"},
         {".xls", "application/vnd.ms-excel"},
         {".rtf", "application/rtf"},
         {".mov", "video/quicktime"},
         {".mp3", "audio/mpeg"},
         {".z", "application/x-compress"},
         {".wav", "audio/x-wav"},
         {".ico", "image/x-icon"},
         {".bmp", "image/bmp"},
         {".m4a", "audio/mpeg"},
         {".m3u", "audio/x-mpegurl"},
         {".exe", "application/octet-stream"},
         {".csv", "text/csv"}],
    proplists:get_value(filename:extension(File), KnownTypes, "text/plain").
309
%% @spec parse_header(string()) -> {Type, [{K, V}]}
%% @doc  Parse a Content-Type like header, return the main Content-Type
%%       and a property list of options. The header is split on ";" and
%%       every piece is stripped of surrounding blanks. The first piece is
%%       the type; each further piece of the form name=value becomes a
%%       {Name, Value} pair with a lowercased name and an unquoted value.
%%       Pieces without a name or without "=" are skipped. The type is
%%       lowercased. A header with no pieces at all (empty, or made up of
%%       ";" only) yields {"", []} rather than crashing.
parse_header(String) ->
    %% TODO: This is exactly as broken as Python's cgi module.
    %%       Should parse properly like mochiweb_cookies.
    case [string:strip(S) || S <- string:tokens(String, ";")] of
        [] ->
            %% Nothing to parse: string:tokens/2 found no pieces.
            {"", []};
        [Type | Parts] ->
            F = fun (S, Acc) ->
                        case lists:splitwith(fun (C) -> C =/= $= end, S) of
                            {"", _} ->
                                %% Skip anything with no name
                                Acc;
                            {_, ""} ->
                                %% Skip anything with no value
                                Acc;
                            {Name, [$\= | Value]} ->
                                [{to_lower(string:strip(Name)),
                                  unquote_header(string:strip(Value))} | Acc]
                        end
                end,
            %% foldr keeps the options in the order they appear.
            {to_lower(Type),
             lists:foldr(F, [], Parts)}
    end.
332
%% Remove the quoting from a header parameter value. A value that does not
%% start with a double quote is returned unchanged. Otherwise the leading
%% quote is dropped, a backslash makes the following character literal, and
%% a double quote that is the very last character is dropped as well.
unquote_header([$\" | Quoted]) ->
    unquote_header(Quoted, []);
unquote_header(Plain) ->
    Plain.

unquote_header([], RevOut) ->
    lists:reverse(RevOut);
unquote_header([$\"], RevOut) ->
    lists:reverse(RevOut);
unquote_header([$\\, Escaped | Tail], RevOut) ->
    unquote_header(Tail, [Escaped | RevOut]);
unquote_header([Ch | Tail], RevOut) ->
    unquote_header(Tail, [Ch | RevOut]).
346
%% @spec record_to_proplist(Record, Fields) -> proplist()
%% @doc Same as record_to_proplist/3 with the TypeKey set to '__record'.
record_to_proplist(Record, Fields) ->
    record_to_proplist(Record, Fields, '__record').

%% @spec record_to_proplist(Record, Fields, TypeKey) -> proplist()
%% @doc Convert a record tuple into a proplist. The first pair is
%%      {TypeKey, RecordName}; it is followed by one {Field, Value} pair per
%%      entry of Fields, in order. Fields should be obtained by calling
%%      record_info(fields, record_type) for the record type of Record.
%%      The call fails with function_clause unless Record is a tuple with
%%      exactly one more element than Fields has entries.
record_to_proplist(Record, Fields, TypeKey)
  when is_tuple(Record),
       is_list(Fields),
       tuple_size(Record) =:= length(Fields) + 1 ->
    Keys = [TypeKey | Fields],
    Values = tuple_to_list(Record),
    lists:zip(Keys, Values).
363
364
%% Worker for shell_quote/1. Copies the input onto the reversed output,
%% putting a backslash in front of each double quote, backquote, backslash
%% and dollar sign, and appends the closing double quote at the end.
shell_quote([], RevOut) ->
    lists:reverse(RevOut, [$\"]);
shell_quote([$\" | Tail], RevOut) ->
    shell_quote(Tail, [$\", $\\ | RevOut]);
shell_quote([$\` | Tail], RevOut) ->
    shell_quote(Tail, [$\`, $\\ | RevOut]);
shell_quote([$\\ | Tail], RevOut) ->
    shell_quote(Tail, [$\\, $\\ | RevOut]);
shell_quote([$\$ | Tail], RevOut) ->
    shell_quote(Tail, [$\$, $\\ | RevOut]);
shell_quote([Ch | Tail], RevOut) ->
    shell_quote(Tail, [Ch | RevOut]).
372
%% Lowercase a single ISO 8859-1 (Latin-1) character code. Uppercase
%% letters sit 32 below their lowercase forms in three ranges: $A..$Z,
%% 16#C0..16#D6 and 16#D8..16#DE. The code 16#D7 (multiplication sign)
%% lies between the last two ranges and is deliberately left alone, as is
%% every other value.
to_lower_char(C) when is_integer(C),  C >= $A, C =< $Z ->
    C + 32;
to_lower_char(C) when is_integer(C),  C >= 16#C0, C =< 16#D6 ->
    %% The range starts at 16#C0 (A with grave accent), not 16#C1.
    C + 32;
to_lower_char(C) when is_integer(C),  C >= 16#D8, C =< 16#DE ->
    C + 32;
to_lower_char(C) ->
    C.

%% Lowercase a string (list of character codes) or a single character
%% code, using the Latin-1 rules of to_lower_char/1.
to_lower(S) when is_list(S) ->
    [to_lower_char(C) || C <- S];
to_lower(C) when is_integer(C) ->
    to_lower_char(C).
386
%% Run every self-test of this module in a fixed order. A failing test
%% crashes with the error raised by its failed match; otherwise ok is
%% returned.
test() ->
    Suite = [fun test_join/0,
             fun test_quote_plus/0,
             fun test_unquote/0,
             fun test_urlencode/0,
             fun test_parse_qs/0,
             fun test_urlsplit_path/0,
             fun test_urlunsplit_path/0,
             fun test_urlsplit/0,
             fun test_urlunsplit/0,
             fun test_path_split/0,
             fun test_guess_mime/0,
             fun test_parse_header/0,
             fun test_shell_quote/0,
             fun test_cmd/0,
             fun test_cmd_string/0],
    lists:foreach(fun (Case) -> Case() end, Suite),
    ok.
404
%% shell_quote/1 must escape "$", the double quote and the backquote, and
%% leave the single quote and spaces alone.
test_shell_quote() ->
    Quoted = shell_quote("foo $bar\"`' baz"),
    "\"foo \\$bar\\\"\\`' baz\"" = Quoted,
    ok.
408
%% Running echo through cmd/1 must give back the argument verbatim: the
%% quoting has to stop the shell from expanding "$" and backquotes.
test_cmd() ->
    Output = cmd(["echo", "$bling$ `word`!"]),
    "$bling$ `word`!\n" = Output,
    ok.
412
%% cmd_string/1 must quote every argument separately and join them with a
%% single space.
test_cmd_string() ->
    CommandLine = cmd_string(["echo", "$bling$ `word`!"]),
    "\"echo\" \"\\$bling\\$ \\`word\\`!\"" = CommandLine,
    ok.
416
%% A multipart Content-Type header must yield its type and its boundary
%% option.
test_parse_header() ->
    Parsed = parse_header("multipart/form-data; boundary=AaB03x"),
    {"multipart/form-data", [{"boundary", "AaB03x"}]} = Parsed,
    ok.
421
%% Table of {Filename, ExpectedMimeType} checks for guess_mime/1.
test_guess_mime() ->
    Cases = [{"", "text/plain"},
             {".text", "text/plain"},
             {".zip", "application/zip"},
             {"x.zip", "application/zip"},
             {"x.html", "text/html"},
             {"x.xhtml", "application/xhtml+xml"}],
    lists:foreach(fun ({File, Mime}) -> Mime = guess_mime(File) end, Cases),
    ok.
430
%% Table of {Path, ExpectedSplit} checks for path_split/1.
test_path_split() ->
    Cases = [{"/foo/bar", {"", "foo/bar"}},
             {"foo/bar", {"foo", "bar"}},
             {"bar", {"bar", ""}}],
    lists:foreach(fun ({Path, Split}) -> Split = path_split(Path) end, Cases),
    ok.
436
%% Table of {Url, ExpectedParts} checks for urlsplit/1, with and without a
%% scheme and network location.
test_urlsplit() ->
    Cases = [{"/foo#bar?baz",
              {"", "", "/foo", "", "bar?baz"}},
             {"http://host:port/foo#bar?baz",
              {"http", "host:port", "/foo", "", "bar?baz"}}],
    lists:foreach(fun ({Url, Parts}) -> Parts = urlsplit(Url) end, Cases),
    ok.
442
%% Table of {Path, ExpectedParts} checks for urlsplit_path/1.
test_urlsplit_path() ->
    Cases = [{"/foo/bar", {"/foo/bar", "", ""}},
             {"/foo?baz", {"/foo", "baz", ""}},
             {"/foo#bar?baz", {"/foo", "", "bar?baz"}},
             {"/foo#bar?baz#wibble", {"/foo", "", "bar?baz#wibble"}},
             {"/foo?bar#baz", {"/foo", "bar", "baz"}},
             {"/foo?bar?baz#baz", {"/foo", "bar?baz", "baz"}}],
    lists:foreach(fun ({Path, Parts}) -> Parts = urlsplit_path(Path) end,
                  Cases),
    ok.
451
%% Table of {Parts, ExpectedUrl} checks for urlunsplit/1.
test_urlunsplit() ->
    Cases = [{{"", "", "/foo", "", "bar?baz"},
              "/foo#bar?baz"},
             {{"http", "host:port", "/foo", "", "bar?baz"},
              "http://host:port/foo#bar?baz"}],
    lists:foreach(fun ({Parts, Url}) -> Url = urlunsplit(Parts) end, Cases),
    ok.
457
%% Table of {Parts, ExpectedPath} checks for urlunsplit_path/1.
test_urlunsplit_path() ->
    Cases = [{{"/foo/bar", "", ""}, "/foo/bar"},
             {{"/foo", "baz", ""}, "/foo?baz"},
             {{"/foo", "", "bar?baz"}, "/foo#bar?baz"},
             {{"/foo", "", "bar?baz#wibble"}, "/foo#bar?baz#wibble"},
             {{"/foo", "bar", "baz"}, "/foo?bar#baz"},
             {{"/foo", "bar?baz", "baz"}, "/foo?bar?baz#baz"}],
    lists:foreach(fun ({Parts, Path}) -> Path = urlunsplit_path(Parts) end,
                  Cases),
    ok.
466
%% Table of {Strings, Separator, Expected} checks for join/2, covering
%% character and string separators, deep elements, a single element and an
%% empty separator.
test_join() ->
    Cases = [{["foo", "bar", "baz"], $,, "foo,bar,baz"},
             {["foo", "bar", "baz"], ",", "foo,bar,baz"},
             {[["foo", " bar"]], ",", "foo bar"},
             {[["foo", " bar"], "baz"], ",", "foo bar,baz"},
             {["foo"], ",", "foo"},
             {["foo", "bar", "baz"], "", "foobarbaz"}],
    lists:foreach(fun ({Strings, Sep, Joined}) ->
                          Joined = join(Strings, Sep)
                  end, Cases),
    ok.
475
%% Checks for quote_plus/1: atom and integer input, plain strings, the
%% space to "+" rule, percent escapes for control and reserved characters,
%% and the punctuation characters that must pass through unescaped.
test_quote_plus() ->
    "foo" = quote_plus(foo),
    "1" = quote_plus(1),
    "foo" = quote_plus("foo"),
    "foo+bar" = quote_plus("foo bar"),
    "foo%0A" = quote_plus("foo\n"),
    %% "-", "_", "." and "~" are safe and must not be escaped.
    "a-b_c.d~e" = quote_plus("a-b_c.d~e"),
    "foo%3B%26%3D" = quote_plus("foo;&="),
    ok.
485
%% Table of {Encoded, ExpectedDecoded} checks for unquote/1.
test_unquote() ->
    Cases = [{"foo+bar", "foo bar"},
             {"foo%20bar", "foo bar"},
             {"foo%0D%0A", "foo\r\n"}],
    lists:foreach(fun ({Encoded, Plain}) -> Plain = unquote(Encoded) end,
                  Cases),
    ok.
491
%% urlencode/1 must accept atom, string and integer keys and values and
%% keep the pairs in their input order.
test_urlencode() ->
    Props = [{foo, "bar"},
             {"baz", "wibble \r\n"},
             {z, 1}],
    "foo=bar&baz=wibble+%0D%0A&z=1" = urlencode(Props),
    ok.
497
%% parse_qs/1 must decode "+" and percent escapes and keep the pairs in
%% their input order.
test_parse_qs() ->
    Parsed = parse_qs("foo=bar&baz=wibble+%0D%0A&z=1"),
    [{"foo", "bar"}, {"baz", "wibble \r\n"}, {"z", "1"}] = Parsed,
    ok.
502