1import copy
2from documentgenerator import  DocumentGenerator
3import re
4import datetime
5import json
6import random, string
7import os
8import logger
9
10from data import COUNTRIES, COUNTRY_CODE, FIRST_NAMES, LAST_NAMES
11
# Module-level logger; generate_expected_result() and _get_order_clause()
# write through this object rather than through the instance's ``self.log``.
log = logger.Logger.get_logger()
13
14class TuqGenerators(object):
15
16    def __init__(self, log, full_set):
17        self.log = log
18        self.full_set = full_set
19        self.query = None
20        self.type_args = {}
21        self.nests = self._all_nested_objects(full_set[0])
22        self.type_args['str'] = [attr[0] for attr in full_set[0].iteritems()
23                            if isinstance(attr[1], unicode)]
24        self.type_args['int'] = [attr[0] for attr in full_set[0].iteritems()
25                            if isinstance(attr[1], int)]
26        self.type_args['float'] = [attr[0] for attr in full_set[0].iteritems()
27                            if isinstance(attr[1], float)]
28        self.type_args['bool'] = [attr[0] for attr in full_set[0].iteritems()
29                            if isinstance(attr[1], bool)]
30        self.type_args['list_str'] = [attr[0] for attr in full_set[0].iteritems()
31                            if isinstance(attr[1], list) and isinstance(attr[1][0], unicode)]
32        self.type_args['list_int'] = [attr[0] for attr in full_set[0].iteritems()
33                            if isinstance(attr[1], list) and isinstance(attr[1][0], int)]
34        self.type_args['list_obj'] = [attr[0] for attr in full_set[0].iteritems()
35                            if isinstance(attr[1], list) and isinstance(attr[1][0], dict)]
36        self.type_args['obj'] = [attr[0] for attr in full_set[0].iteritems()
37                             if isinstance(attr[1], dict)]
38        for obj in self.type_args['obj']:
39            self.type_args['_obj%s_str' % (self.type_args['obj'].index(obj))] = [attr[0] for attr in full_set[0][obj].iteritems()
40                                                                                    if isinstance(attr[1], str)]
41            self.type_args['_obj%s_int'% (self.type_args['obj'].index(obj))] = [attr[0] for attr in full_set[0][obj].iteritems()
42                                                                                    if isinstance(attr[1], int)]
43        for obj in self.type_args['list_obj']:
44            self.type_args['_list_obj%s_str' % (self.type_args['list_obj'].index(obj))] = [attr[0] for attr in full_set[0][obj][0].iteritems()
45                                                                                    if isinstance(attr[1], str) or isinstance(attr[1], unicode)]
46            self.type_args['_list_obj%s_int'% (self.type_args['list_obj'].index(obj))] = [attr[0] for attr in full_set[0][obj][0].iteritems()
47                                                                                    if isinstance(attr[1], int)]
48        for i in xrange(2, 5):
49            self.type_args['nested_%sl' % i] = [attr for attr in self.nests if len(attr.split('.')) == i]
50        for i in xrange(2, 5):
51            self.type_args['nested_list_%sl' % i] = [attr[0] for attr in self.nests.iteritems() if len(attr[0].split('.')) == i and isinstance(attr[1], list)]
52        self._clear_current_query()
53
54    def generate_query(self, template):
55        query = template
56        for name_type, type_arg in self.type_args.iteritems():
57            for attr_type_arg in type_arg:
58                query = query.replace('$%s%s' % (name_type, type_arg.index(attr_type_arg)), attr_type_arg)
59        for expr in [' where ', ' select ', ' from ', ' order by', ' limit ', 'end',
60                     ' offset ', ' count(' , 'group by', 'unnest', 'min', 'satisfies']:
61            query = query.replace(expr, expr.upper())
62        self.log.info("Generated query to be run: '''%s'''" % query)
63        self.query = query
64        return query
65
66    def generate_expected_result(self, print_expected_result = True):
67        try:
68            self._create_alias_map()
69            from_clause = self._format_from_clause()
70            log.info("FROM clause ===== is %s" % from_clause)
71            where_clause = self._format_where_clause(from_clause)
72            log.info("WHERE clause ===== is %s" % where_clause)
73            unnest_clause = self._format_unnest_clause(from_clause)
74            log.info("UNNEST clause ===== is %s" % unnest_clause)
75            select_clause = self._format_select_clause(from_clause)
76            log.info("SELECT clause ===== is %s" % select_clause)
77            result = self._filter_full_set(select_clause, where_clause, unnest_clause)
78            result = self._order_results(result)
79            result = self._limit_and_offset(result)
80            if print_expected_result:
81                log.info("Expected result is %s ..." % str(result[:15]))
82            return result
83        finally:
84            self._clear_current_query()
85
86    def _all_nested_objects(self, d):
87        def items():
88            for key, value in d.items():
89                if isinstance(value, dict):
90                    for subkey, subvalue in self._all_nested_objects(value).items():
91                        yield key + "." + subkey, subvalue
92                else:
93                    yield key, value
94        return dict(items())
95
96    def _create_alias_map(self):
97        query_dict = self.query.split()
98        for word in query_dict:
99            if word.upper() == 'AS':
100                self.aliases[query_dict[query_dict.index(word) + 1]] = query_dict[query_dict.index(word) - 1]
101
    def _format_where_clause(self, from_clause=None):
        """Turn the WHERE part of ``self.query`` into a Python expression string.

        The returned text refers to the current document as ``doc``;
        ``_filter_full_set`` evaluates it with ``eval`` for every document.

        :param from_clause: FROM clause text as produced by
            ``_format_from_clause``.  When it contains a dot, every ``doc[``
            lookup is additionally nested under its last dotted component.
        :return: the expression string, or None when the query has no WHERE.
        """
        if self.query.find('WHERE') == -1:
            return None
        # Keep only the text after WHERE and before ORDER BY / GROUP BY.
        clause = re.sub(r'ORDER BY.*', '', re.sub(r'.*WHERE', '', self.query))
        clause = re.sub(r'GROUP BY.*', '', clause)
        attributes = self.get_all_attributes()
        conditions = clause.replace('IS NULL', 'is None')
        conditions = conditions.replace('IS NOT NULL', 'is not None')
        # ANY ... END is translated separately; cut it out of the plain
        # conditions and merge it back further down.
        satisfy_expr = self.format_satisfy_clause()
        if satisfy_expr:
            conditions = re.sub(r'ANY.*END', '', conditions).strip()
        # Rewrite dotted references (parent.child) into doc[...] lookups.
        regex = re.compile("[\w']+\.[\w']+")
        atts = regex.findall(conditions)
        for att in atts:
            parent, child = att.split('.')
            # Every replacement below only matches a reference that has a
            # single space on both sides.
            if parent in attributes:
                conditions = conditions.replace(' %s.%s ' % (parent, child),
                                                ' doc["%s"]["%s"] ' % (parent, child))
            else:
                # parent is not a document attribute: unknown prefixes are
                # dropped, aliases of attributes are resolved to the attribute.
                if parent not in self.aliases:
                    conditions = conditions.replace(' %s.%s ' % (parent, child),
                                                ' doc["%s"] ' % (child))
                elif self.aliases[parent] in attributes:
                    conditions = conditions.replace(' %s.%s ' % (parent, child),
                                                    ' doc["%s"]["%s"] ' % (self.aliases[parent], child))
                else:
                    conditions = conditions.replace(' %s.%s ' % (parent, child),
                                                    ' doc["%s"] ' % (child))
        # Plain (undotted) attribute names, again only when space-delimited.
        for attr in attributes:
            conditions = conditions.replace(' %s ' % attr, ' doc["%s"] ' % attr)
        if satisfy_expr:
            if conditions:
                # Lower-case the first joiner found (AND is tried before OR) and
                # attach the ANY expression: appended (with no separator) when
                # the joiner is not at position 0, otherwise prepended.
                for join in ["AND", "OR"]:
                    present = conditions.find(join)
                    if present > -1:
                        conditions = conditions.replace(join, join.lower())
                        if present > 0:
                            conditions += '' + satisfy_expr
                            break
                        else:
                            conditions = satisfy_expr + ' ' + conditions
                            break
            else:
                # The ANY expression was the whole WHERE clause.
                conditions += '' + satisfy_expr
        if from_clause and from_clause.find('.') != -1:
            # Dotted FROM path: also map the names that get_all_attributes()
            # does not return, then nest every lookup under the last path
            # component of the FROM clause.
            sub_attrs = [att for name, group in self.type_args.iteritems()
                         for att in group if att not in attributes]
            for attr in sub_attrs:
                conditions = conditions.replace(' %s ' % attr, ' doc["%s"] ' % attr)
            conditions = conditions.replace('doc[', 'doc["%s"][' % from_clause.split('.')[-1])
        # The query's single '=' becomes Python's equality operator.
        conditions = conditions.replace(' = ', ' == ')
        return conditions
154
155    def _format_from_clause(self):
156        clause = re.sub(r'ORDER BY.*', '', re.sub(r'.*FROM', '', self.query)).strip()
157        clause = re.sub(r'WHERE.*', '', re.sub(r'GROUP BY.*', '', clause)).strip()
158        clause = re.sub(r'SELECT.*', '', clause).strip()
159        if len(clause.split()) == 2:
160            self.aliases[clause.split()[1]] = clause.split()[0]
161        return clause
162
    def _format_unnest_clause(self, from_clause):
        """Translate the UNNEST part of the FROM clause into a ``doc[...]`` lookup.

        Besides building the lookup string, the alias given to the unnested
        attribute (if any) is registered in ``self.aliases``; the stored value
        is a plain name or a tuple of path components.

        :param from_clause: FROM clause text as produced by
            ``_format_from_clause``.
        :return: the lookup expression string, or None when there is no UNNEST.
            When the text after UNNEST matches none of the handled shapes the
            raw text is returned unchanged.
        """
        if from_clause.find('UNNEST') == -1:
            return None
        clause = re.sub(r'.*UNNEST', '', from_clause)
        attr = clause.split()
        if len(attr) == 1:
            # UNNEST <attribute>
            clause = 'doc["%s"]' % attr[0]
        elif len(attr) == 2:
            # UNNEST <attribute> <alias>
            attributes = self.get_all_attributes()
            if attr[0].find('.') != -1:
                splitted = attr[0].split('.')
                if splitted[0] not in attributes:
                    # The first path component is not a document attribute, so
                    # it is dropped; the alias maps to the remaining path as a
                    # tuple and the lookup is made by the alias name itself.
                    alias = [attr[0].split('.')[1],]
                    clause = 'doc["%s"]' % attr[1]
                    for inner in splitted[2:]:
                        alias.append(inner)
                    self.aliases[attr[1]] = tuple(alias)
                    return clause
                # NOTE(review): this unpacking expects exactly one dot; a longer
                # path whose first component is an attribute would raise
                # ValueError here.
                parent, child = attr[0].split('.')
                if parent in attributes:
                    clause = 'doc["%s"]["%s"]' % (parent, child)
                    self.aliases[attr[1]] = (parent, child)
                else:
                    if parent not in self.aliases:
                        clause = 'doc["%s"]' % (child)
                        self.aliases[attr[1]] = child
                    elif self.aliases[parent] in attributes:
                        clause = 'doc["%s"]["%s"]' % (self.aliases[parent], child)
                        self.aliases[attr[1]] = (self.aliases[parent], child)
                    else:
                        clause = 'doc["%s"]' % (child)
                        self.aliases[attr[1]] = child
            else:
                clause = 'doc["%s"]' % attr[0]
                self.aliases[attr[1]] = attr[0]
        elif len(attr) == 3 and ('as' in attr or 'AS' in attr):
            # UNNEST <attribute> AS <alias>: same resolution as above, with the
            # alias taken from the third token.
            attributes = self.get_all_attributes()
            if attr[0].find('.') != -1:
                parent, child = attr[0].split('.')
                if parent in attributes:
                    clause = 'doc["%s"]["%s"]' % (parent, child)
                    self.aliases[attr[2]] = (parent, child)
                else:
                    if parent not in self.aliases:
                        clause = 'doc["%s"]' % (child)
                        self.aliases[attr[2]] = child
                    elif self.aliases[parent] in attributes:
                        clause = 'doc["%s"]["%s"]' % (self.aliases[parent], child)
                        self.aliases[attr[2]] = (self.aliases[parent], child)
                    else:
                        clause = 'doc["%s"]' % (child)
                        self.aliases[attr[2]] = child
            else:
                clause = 'doc["%s"]' % attr[0]
                self.aliases[attr[2]] = attr[0]
        return clause
219
    def _format_select_clause(self, from_clause=None):
        """Translate the SELECT list of ``self.query`` into a dict-literal string.

        The returned text has the form ``{"key" : doc["attr"],...}`` and is
        evaluated with ``eval`` by ``_filter_full_set`` for each document
        (bound to the name ``doc``).  While parsing, this method also fills
        ``self.aggr_fns`` (COUNT / MIN), sets ``self.distinct`` and registers
        select aliases in ``self.aliases``.

        :param from_clause: FROM clause text as produced by
            ``_format_from_clause``; anything from UNNEST on is ignored here.
        :return: the expression string; for ``<name>.*`` a bare
            ``doc["<name>"]`` lookup is returned instead of a dict literal.
        """
        # Keep only the text between SELECT and FROM / WHERE / ORDER BY.
        select_clause = re.sub(r'ORDER BY.*', '', re.sub(r'.*SELECT', '', self.query)).strip()
        select_clause = re.sub(r'WHERE.*', '', re.sub(r'FROM.*', '', select_clause)).strip()
        select_attrs = select_clause.split(',')
        if from_clause and from_clause.find('UNNEST') != -1:
            from_clause = re.sub(r'UNNEST.*', '', from_clause).strip()
        condition = '{'
        #handle aliases
        for attr_s in select_attrs:
            # attr holds the whitespace-separated tokens of one select item.
            attr = attr_s.split()
            if re.match(r'COUNT\(.*\)', attr[0]):
                    # COUNT(<field>): remember field and alias; the counting
                    # itself happens in _filter_full_set / _group_results.
                    attr[0] = re.sub(r'\)', '', re.sub(r'.*COUNT\(', '', attr[0])).strip()
                    self.aggr_fns['COUNT'] = {}
                    if attr[0].upper() == 'DISTINCT':
                        attr = attr[1:]
                        self.distinct= True
                    if attr[0].find('.') != -1:
                        parent, child = attr[0].split('.')
                        attr[0] = child
                    if attr[0] in self.aliases:
                        attr[0] = self.aliases[attr[0]]
                    self.aggr_fns['COUNT']['field'] = attr[0]
                    # '$1' is used as the alias when the item has no further token.
                    self.aggr_fns['COUNT']['alias'] = ('$1', attr[-1])[len(attr) > 1]
                    if attr[0] == '*':
                        condition += '"%s" : doc,' % attr[-1]
                    continue
            elif re.match(r'MIN\(.*\)', attr[0]):
                    # MIN(<field>): same bookkeeping as COUNT, and the field is
                    # also selected under the aggregate's alias.
                    attr[0] = re.sub(r'\)', '', re.sub(r'.*MIN\(', '', attr[0])).strip()
                    self.aggr_fns['MIN'] = {}
                    if attr[0].find('.') != -1:
                        parent, child = attr[0].split('.')
                        attr[0] = child
                    if attr[0] in self.aliases:
                        attr[0] = self.aliases[attr[0]]
                    self.aggr_fns['MIN']['field'] = attr[0]
                    self.aggr_fns['MIN']['alias'] = ('$1', attr[-1])[len(attr) > 1]
                    self.aliases[('$1', attr[-1])[len(attr) > 1]] = attr[0]
                    # NOTE(review): unlike the other branches no trailing comma
                    # is appended after this entry.
                    condition += '"%s": doc["%s"]' % (self.aggr_fns['MIN']['alias'], self.aggr_fns['MIN']['field'])
                    continue
            elif attr[0].upper() == 'DISTINCT':
                attr = attr[1:]
                self.distinct= True
            if attr[0] == '*':
                condition += '"*" : doc,'
            elif len(attr) == 1:
                # Single token: no alias.
                if attr[0].find('.') != -1:
                    if attr[0].find('[') != -1:
                        # Indexed reference: the text before '[' becomes the
                        # lookup key and the index part is appended verbatim.
                        condition += '"%s" : doc["%s"]%s,' % (attr[0], attr[0][:attr[0].find('[')], attr[0][attr[0].find('['):])
                    elif attr[0].split('.')[1] == '*':
                        # <name>.* selects the whole sub-document and ends the
                        # parsing immediately.
                        condition = 'doc["%s"]' % (attr[0].split('.')[0])
                        return condition
                    else:
                        # A prefix that is not an attribute but occurs in the
                        # FROM clause is treated as the source name and dropped.
                        if attr[0].split('.')[0] not in self.get_all_attributes() and\
                                        from_clause.find(attr[0].split('.')[0]) != -1:
                            condition += '"%s" : doc["%s"],' % (attr[0].split('.')[1], attr[0].split('.')[1])
                            continue
                        else:
                            # NOTE(review): the inner key (%s) is emitted without
                            # quotes, so eval treats it as a Python name.
                            condition += '"%s" : {%s : doc["%s"]["%s"]},' % (attr[0].split('.')[0], attr[0].split('.')[1],
                                                                          attr[0].split('.')[0], attr[0].split('.')[1])
                else:
                    if attr[0].find('[') != -1:
                        condition += '"%s" : doc["%s"]%s,' % (attr[0], attr[0][:attr[0].find('[')], attr[0][attr[0].find('['):])
                    else:
                        if attr[0] in self.aliases:
                            # NOTE(review): alias values are either tuples of path
                            # components or plain strings (see
                            # _format_unnest_clause); for a string this indexes
                            # its characters, and nothing is emitted when
                            # len(value) <= 1 -- confirm this is intended.
                            value = self.aliases[attr[0]]
                            if len(value) > 1:
                                condition += '"%s" : doc["%s"]' % (attr[0], value[0])
                                for inner in value[1:]:
                                    condition += '["%s"]' % (inner)
                                condition += ','
                        else:
                            condition += '"%s" : doc["%s"],' % (attr[0], attr[0])
            elif len(attr) == 2:
                # <attribute> <alias>
                if attr[0].find('.') != -1:
                    condition += '"%s" : doc["%s"]["%s"],' % (attr[1], attr[0].split('.')[0], attr[0].split('.')[1])
                else:
                    condition += '"%s" : doc["%s"],' % (attr[1], attr[0])
                self.aliases[attr[1]] = attr[0]
            elif len(attr) == 3 and ('as' in attr or 'AS' in attr):
                # <attribute> AS <alias>; the alias itself is not registered
                # here (_create_alias_map handles AS pairs).
                if attr[0].find('.') != -1:
                    condition += '"%s" : doc["%s"]["%s"],' % (attr[2], attr[0].split('.')[0], attr[0].split('.')[1])
                else:
                    if attr[0].find('[') != -1:
                        condition += '"%s" : doc["%s"]%s,' % (attr[2], attr[0][:attr[0].find('[')], attr[0][attr[0].find('['):])
                    else:
                        condition += '"%s" : doc["%s"],' % (attr[2], attr[0])
        condition += '}'
        if from_clause and from_clause.find('.') != -1:
            # Dotted FROM path: nest every lookup under its last component.
            condition = condition.replace('doc[', 'doc["%s"][' % from_clause.split('.')[-1])
        return condition
310
    def _filter_full_set(self, select_clause, where_clause, unnest_clause):
        """Apply the translated clauses to ``self.full_set`` and return the rows.

        NOTE: the clause strings are executed with ``eval`` (with the loop
        variable ``doc`` in scope).  They are generated by this class from
        ``self.query``; do not feed it query text from an untrusted source.

        :param select_clause: expression string from ``_format_select_clause``.
        :param where_clause: expression string from ``_format_where_clause``,
            or None for no filtering.
        :param unnest_clause: lookup string from ``_format_unnest_clause``,
            or None.
        :return: list of result rows after DISTINCT, UNNEST, grouping and
            COUNT handling.
        """
        # Attributes that ORDER BY needs but SELECT does not return are added
        # to the select expression so sorting can see them; _order_results
        # removes them again afterwards.
        diff = self._order_clause_greater_than_select(select_clause)
        if diff and not self._is_parent_selected(select_clause, diff) and not 'MIN' in self.query:
            if diff[0].find('][') == -1:
                # Flat lookups: drop the closing '}' and append one
                # '"name" : lookup' entry per missing attribute.
                select_clause = select_clause[:-1] + ','.join(['"%s" : %s' %([at.replace('"','') for at in re.compile('"\w+"').findall(attr)][0],
                                                                            attr) for attr in self._order_clause_greater_than_select(select_clause)]) + '}'
            else:
                # Nested lookups (doc["a"]["b"]) are added as '"a" : {"b" : ...}'.
                for attr in self._order_clause_greater_than_select(select_clause):
                    select_clause = select_clause[:-1]
                    for at in re.compile('"\w+"').findall(attr):
                        if attr.find('][') != -1:
                            attrs_split = [at.replace('"','') for at in re.compile('"\w+"').findall(attr)]
                            select_clause = select_clause + '"%s" : {"%s" : %s},' %(attrs_split[0], attrs_split[1], attr)
                        else:
                            select_clause = select_clause + '"%s" : %s,' %([at.replace('"','') for at in re.compile('"\w+"').findall(attr)][0], attr)
                    select_clause = select_clause + '}'
        # Both expressions read the comprehension variable ``doc``.
        if where_clause:
            result = [eval(select_clause) for doc in self.full_set if eval(where_clause)]
        else:
            result = [eval(select_clause) for doc in self.full_set]
        if self.distinct:
            # De-duplicate rows through a set of item tuples (row values must
            # therefore be hashable).
            result = [dict(y) for y in set(tuple(x.items()) for x in result)]
        if unnest_clause:
            # Strip the leading 'doc["' and the trailing '"]' of the lookup.
            unnest_attr = unnest_clause[5:-2]
            if unnest_attr in self.aliases:
                # Aliased UNNEST: emit one copy of the row per unnested item,
                # with the item stored under the alias.
                def res_generator():
                    for doc in result:
                        doc_temp = copy.deepcopy(doc)
                        del doc_temp[unnest_attr]
                        for item in eval(unnest_clause):
                            doc_to_append = copy.deepcopy(doc_temp)
                            doc_to_append[unnest_attr] = copy.deepcopy(item)
                            yield doc_to_append
                result = list(res_generator())
            else:
                # Unaliased UNNEST: the rows are replaced by the unnested items.
                result = [item for doc in result for item in eval(unnest_clause)]
        if self._create_groups()[0]:
            result = self._group_results(result)
        if self.aggr_fns:
            # COUNT without GROUP BY (or over an empty result) collapses the
            # rows into a single {alias: number_of_rows} row.
            if not self._create_groups()[0] or len(result) == 0:
                for fn_name, params in self.aggr_fns.iteritems():
                    if fn_name == 'COUNT':
                        result = [{params['alias'] : len(result)}]
        return result
355
356    def _order_clause_greater_than_select(self, select_clause):
357        order_clause = self._get_order_clause()
358        if not order_clause:
359            return None
360        order_clause = order_clause.replace(',"', '"')
361        diff = set(order_clause.split(',')) - set(re.compile('doc\["[\w\']+"\]').findall(select_clause))
362        diff = [attr.replace(",",'"') for attr in diff if attr != '']
363        for k, v in self.aliases.iteritems():
364            if k.endswith(','):
365                self.aliases[k[:-1]] = v
366                del self.aliases[k]
367        if not set(diff) - set(['doc["%s"]' % alias for alias in self.aliases]):
368            return None
369        else:
370            diff = list(set(diff) - set(['doc["%s"]' % alias for alias in self.aliases]))
371        if diff:
372            self.attr_order_clause_greater_than_select = [re.sub(r'"\].*', '', re.sub(r'doc\["', '', attr)) for attr in diff]
373            self.attr_order_clause_greater_than_select = [attr for attr in self.attr_order_clause_greater_than_select if attr]
374            return list(diff)
375        return None
376
    def _get_order_clause(self):
        """Translate the ORDER BY list of ``self.query`` into ``doc[...]`` lookups.

        :return: a string of comma-terminated lookups (for example
            ``'doc["a"],doc["b"]["c"],'``), or None when the query has no
            ORDER BY.  ASC/DESC tokens are ignored here; ``_order_results``
            handles the direction.
        """
        if self.query.find('ORDER BY') == -1:
            return None
        # Keep only the text between ORDER BY and LIMIT / OFFSET.
        order_clause = re.sub(r'LIMIT.*', '', re.sub(r'.*ORDER BY', '', self.query)).strip()
        order_clause = re.sub(r'OFFSET.*', '', order_clause).strip()
        condition = ""
        order_attrs = order_clause.split(',')
        for attr_s in order_attrs:
            # Only the first token of each item is used.
            attr = attr_s.split()
            # An attribute selected under an alias is looked up by that alias.
            if attr[0] in self.aliases.itervalues():
                    condition += 'doc["%s"],' % (self.get_alias_for(attr[0]))
                    continue
            if attr[0].find('MIN') != -1:
                if 'MIN' not in self.aggr_fns:
                    # MIN appears only in ORDER BY: register it under the
                    # synthetic alias '$gr1'.  attr[0][4:-1] removes the
                    # leading 'MIN(' and the trailing ')'.
                    self.aggr_fns['MIN'] = {}
                    attr[0]= attr[0][4:-1]
                    self.aggr_fns['MIN']['field'] = attr[0]
                    self.aggr_fns['MIN']['alias'] = '$gr1'
                else:
                    if 'alias' in self.aggr_fns['MIN']:
                        condition += 'doc["%s"],' % self.aggr_fns['MIN']['alias']
                        continue
            if attr[0].find('.') != -1:
                attributes = self.get_all_attributes()
                # Keep both path components when the prefix is a document
                # attribute, or an alias whose target is not one.
                if attr[0].split('.')[0] in self.aliases and (not self.aliases[attr[0].split('.')[0]] in attributes) or\
                   attr[0].split('.')[0] in attributes:
                    condition += 'doc["%s"]["%s"],' % (attr[0].split('.')[0],attr[0].split('.')[1])
                else:
                    if attr[0].split('.')[0].find('[') != -1:
                        # Indexed prefix such as name[0].child.
                        ind = attr[0].split('.')[0].index('[')
                        condition += 'doc["%s"]%s["%s"],' % (attr[0].split('.')[0][:ind], attr[0].split('.')[0][ind:],
                                                             attr[0].split('.')[1])
                    else:
                        # Otherwise the prefix is dropped.
                        condition += 'doc["%s"],' % attr[0].split('.')[1]
            else:
                if attr[0].find('[') != -1:
                    # Indexed reference such as name[0].
                    ind = attr[0].index('[')
                    condition += 'doc["%s"]%s,' % (attr[0].split('.')[0][:ind], attr[0].split('.')[0][ind:])
                else:
                    condition += 'doc["%s"],' % attr[0]
        log.info("ORDER clause ========= is %s" % condition)
        return condition
419
    def _order_results(self, result):
        """Sort the rows according to the ORDER BY clause of ``self.query``.

        The sort key is the ``doc[...]`` lookup string from
        ``_get_order_clause``, evaluated with ``eval`` for each row.  Helper
        attributes that were added to the rows only for sorting are removed
        afterwards.

        :param result: list of result rows.
        :return: the sorted rows; if sorting raises, the rows are returned as
            they were passed in.
        """
        order_clause = self._get_order_clause()
        key = None
        reverse = False
        if order_clause:
            all_order_clause = re.sub(r'LIMIT.*', '', re.sub(r'.*ORDER BY', '', self.query)).strip()
            all_order_clause = re.sub(r'OFFSET.*', '', all_order_clause).strip()
            order_attrs = all_order_clause.split(',')
            # A single DESC on any item reverses the whole sort.
            for attr_s in order_attrs:
                attr = attr_s.split()
                if len(attr) == 2 and attr[1].upper() == 'DESC':
                    reverse = True
            # Rows are keyed by alias, so rewrite attribute and aggregate field
            # names inside the lookups to the aliases they were selected under.
            for att_name in re.compile('"[\w\']+"').findall(order_clause):
                if att_name[1:-1] in self.aliases.itervalues():
                    order_clause = order_clause.replace(att_name[1:-1],
                                                        self.get_alias_for(att_name[1:-1]))
                if self.aggr_fns and att_name[1:-1] in [params['field'] for params in self.aggr_fns.itervalues()]:
                    order_clause = order_clause.replace(att_name[1:-1],
                                                        [params['alias'] for params in self.aggr_fns.itervalues()
                                                         if params['field'] == att_name[1:-1]][0])
            if order_clause.find(',"') != -1:
                order_clause = order_clause.replace(',"', '"')
            # The comma-separated lookups evaluate to a tuple, giving a
            # multi-column sort key.
            key = lambda doc: eval(order_clause)
        try:
            result = sorted(result, key=key, reverse=reverse)
        except:
            # NOTE(review): bare except -- any failure while sorting silently
            # returns the rows unsorted and skips the clean-up below.
            return result
        if self.attr_order_clause_greater_than_select and not self.parent_selected:
            # Remove the attributes _filter_full_set added only for sorting.
            for doc in result:
                for attr in self.attr_order_clause_greater_than_select:
                    if attr.find('.') != -1:
                        attr = attr.split('.')[0]
                    if attr in doc:
                        del doc[attr]
                    elif '$gr1' in doc:
                        del doc['$gr1']
        return result
457
458    def _limit_and_offset(self, result):
459        limit_clause = offset_clause = None
460        if self.query.find('LIMIT') != -1:
461            limit_clause = re.sub(r'OFFSET.*', '', re.sub(r'.*LIMIT', '', self.query)).strip()
462        if self.query.find('OFFSET') != -1:
463            offset_clause = re.sub(r'.*OFFSET', '', self.query).strip()
464        if offset_clause:
465            result = result[int(offset_clause):]
466        if limit_clause:
467            result = result[:int(limit_clause)]
468        return result
469
470    def _create_groups(self):
471        if self.query.find('GROUP BY') == -1:
472            return 0, None
473        group_clause = re.sub(r'ORDER BY.*', '', re.sub(r'.*GROUP BY', '', self.query)).strip()
474        if not group_clause:
475            return 0, None
476        attrs = group_clause.split(',')
477        attrs = [attr.strip() for attr in attrs]
478        if len(attrs) == 2:
479            groups = set([(doc[attrs[0]],doc[attrs[1]])  for doc in self.full_set])
480        elif len(attrs) == 1:
481            if attrs[0].find('.') != -1:
482                if len(attrs[0].split('.')) > 2:
483                    groups = set([doc[attrs[0].split('.')[1]][attrs[0].split('.')[2]]
484                              for doc in self.full_set])
485                else:
486                    groups = set([doc[attrs[0].split('.')[0]][attrs[0].split('.')[1]]
487                              for doc in self.full_set])
488            else:
489                groups = set([doc[attrs[0]]  for doc in self.full_set])
490        return attrs, groups
491
    def _group_results(self, result):
        """Collapse the rows into one row per GROUP BY group.

        For every aggregate registered in ``self.aggr_fns`` the rows are
        replaced by one row per group holding the group value(s) and the
        aggregate under its alias.

        :param result: list of result rows.
        :return: list of grouped rows.
        """
        attrs, groups = self._create_groups()
        for fn_name, params in self.aggr_fns.iteritems():
            if fn_name == 'COUNT':
                # NOTE(review): this branch indexes attrs[1] and group[1], i.e.
                # it assumes exactly two GROUP BY attributes -- confirm callers
                # never use COUNT with a single group attribute.
                result = [{attrs[0] : group[0], attrs[1] : group[1],
                                params['alias'] : len([doc for doc in result
                                if doc[attrs[0]]==group[0] and doc[attrs[1]]==group[1]])}
                          for group in groups]
                # Groups that matched no row are dropped.
                result = [doc for doc in result if doc[params['alias']] > 0]
            if fn_name == 'MIN':
                if isinstance(list(groups)[0], tuple):
                    # Two group attributes: groups are (value, value) pairs.
                    result = [{attrs[0] : group[0], attrs[1] : group[1],
                                    params['alias'] : min([doc[params['field']] for doc in result
                                    if doc[attrs[0]]==group[0] and doc[attrs[1]]==group[1]])}
                              for group in groups]
                else:
                    # One group attribute; rows are keyed by its select alias
                    # when it has one.
                    if attrs[0] in self.aliases.itervalues():
                        attrs[0] = self.get_alias_for(attrs[0]).replace(',', '')
                    result = [{attrs[0] : group,
                                params['alias'] : min([doc[params['alias']] for doc in result
                                if doc[attrs[0]]==group])}
                          for group in groups]
        else:
            # The loop contains no ``break``, so this ``else`` runs every time
            # the loop finishes (also when aggr_fns is empty): duplicate rows
            # are removed through a set of item tuples.
            result = [dict(y) for y in set(tuple(x.items()) for x in result)]
        return result
517
518    def get_alias_for(self, value_search):
519        for key, value in self.aliases.iteritems():
520            if value == value_search:
521                return key
522        return ''
523
524    def get_all_attributes(self):
525        return [att for name, group in self.type_args.iteritems()
526                for att in group if not name.startswith('_')]
527
528    def _is_parent_selected(self, clause, diff):
529        self.parent_selected = len([select_el for select_el in re.compile('doc\["[\w\']+"\]').findall(clause)
530                for diff_el in diff if diff_el.find(select_el) != -1]) > 0
531        return self.parent_selected
532
    def format_satisfy_clause(self):
        """Translate ``ANY <var> IN <attr> SATISFIES <cond> END`` into Python.

        The result has the form
        ``len([<var> for <var> in <lookup> if <cond>]) > 0`` and is merged into
        the WHERE expression by ``_format_where_clause``.

        NOTE(review): ``result_clause`` is only initialised when the query
        contains ``ANY``.  A query that contains ``EVERY`` but not ``ANY``
        passes the first check and then reaches the code below with
        ``result_clause`` unbound, which raises UnboundLocalError.

        :return: the expression string, or an empty string when the query has
            no ANY / EVERY clause.
        """
        if self.query.find('ANY') == -1 and self.query.find('EVERY') == -1:
            return ''
        # Keep the text after the last ANY (or EVERY / ALL) and before END.
        satisfy_clause = re.sub(r'.*ANY', '', re.sub(r'END.*', '', self.query)).strip()
        satisfy_clause = re.sub(r'.*ALL', '', re.sub(r'.*EVERY', '', satisfy_clause)).strip()
        if not satisfy_clause:
            return ''
        # The collection being searched: the text between IN and SATISFIES.
        main_attr = re.sub(r'SATISFIES.*', '', re.sub(r'.*IN', '', satisfy_clause)).strip()
        attributes = self.get_all_attributes()
        if main_attr in attributes:
            main_attr = 'doc["%s"]' % (main_attr)
        else:
            if main_attr.find('.') != -1:
                parent, child = main_attr.split('.')
                # An alias of a document attribute keeps both levels; any other
                # prefix is dropped.
                if parent in self.aliases and self.aliases[parent] in attributes:
                    main_attr = 'doc["%s"]["%s"]' % (self.aliases[parent], child)
                else:
                    main_attr = 'doc["%s"]' % (child)
        var = "att"
        if self.query.find('ANY') != -1:
            # The loop variable is the text between ANY and IN.
            var = re.sub(r'.*ANY', '', re.sub(r'IN.*', '', self.query)).strip()
            result_clause = 'len([{0} for {1} in {2} if '.format(var, var, main_attr)
        satisfy_expr = re.sub(r'.*SATISFIES', '', re.sub(r'END.*', '', satisfy_clause)).strip()
        # Translate the condition token by token.
        for expr in satisfy_expr.split():
            if expr.find('.') != -1:
                # <anything>.<field> becomes a lookup on the loop variable.
                result_clause += ' {0}["{1}"] '.format(var, expr.split('.')[1])
            elif expr.find('=') != -1:
                # Any token containing '=' is emitted as '=='.
                result_clause += ' == '
            elif expr.upper() in ['AND', 'OR', 'NOT']:
                result_clause += expr.lower()
            else:
                result_clause += ' %s ' % expr
        result_clause += ']) > 0'
        return result_clause
567
568    def _clear_current_query(self):
569        self.distinct = False
570        self.aggr_fns = {}
571        self.aliases = {}
572        self.attr_order_clause_greater_than_select = []
573        self.parent_selected = False
574
575class JsonGenerator:
576
577    def generate_docs_employee(self, docs_per_day = 1, start=0, isShuffle = False):
578        generators = []
579        types = self._shuffle(['Engineer', 'Sales', 'Support'],isShuffle)
580        join_yr = self._shuffle([2010, 2011],isShuffle)
581        join_mo = self._shuffle(xrange(1, 12 + 1),isShuffle)
582        join_day = self._shuffle(xrange(1, 28 + 1),isShuffle)
583        template = '{{ "name":"{0}", "join_yr":{1}, "join_mo":{2}, "join_day":{3},'
584        template += ' "email":"{4}", "job_title":"{5}", "test_rate":{8}, "skills":{9},'
585        template += '"VMs": {10},'
586        template += ' "tasks_points" : {{"task1" : {6}, "task2" : {7}}}}}'
587        count = 1
588        for info in types:
589            for year in join_yr:
590                for month in join_mo:
591                    for day in join_day:
592                        random.seed(count)
593                        count+=1
594                        prefix = "employee"+str(random.random()*100000)
595                        name = ["employee-%s" % (str(day))]
596                        email = ["%s-mail@couchbase.com" % (str(day))]
597                        vms = [{"RAM": month, "os": "ubuntu",
598                                "name": "vm_%s" % month, "memory": month},
599                               {"RAM": month, "os": "windows",
600                                "name": "vm_%s"% (month + 1), "memory": month}
601                             ]
602                        generators.append(DocumentGenerator("query-test" + prefix,
603                                               template,
604                                               name, [year], [month], [day],
605                                               email, [info], range(1,10), range(1,10),
606                                               [float("%s.%s" % (month, month))],
607                                               [["skill%s" % y for y in join_yr]],
608                                               [vms],
609                                               start=start, end=docs_per_day))
610        return generators
611
612    def generate_docs_employee_array(self, docs_per_day = 1, start=0, isShuffle = False):
613        generators = []
614        #simple array
615        department = self._shuffle(['Developer', 'Support','HR','Tester','Manager'],isShuffle)
616        sport = ['Badminton','Cricket','Football','Basketball','American Football','ski']
617        dance = ['classical','bollywood','salsa','hip hop','contemporary','bhangra']
618        join_yr = self._shuffle([2010, 2011,2012,2013,2014,2015,2016],isShuffle)
619        join_mo = self._shuffle(xrange(1, 12 + 1),isShuffle)
620        join_day = self._shuffle(xrange(1, 28 + 1),isShuffle)
621        engineer = ["Query","Search","Indexing","Storage","Android","IOS"]
622        marketing = ["East","West","North","South","International"]
623        cities = ['Mumbai','Delhi','New York','San Francisco']
624        streets = ['21st street','12th street','18th street']
625        countries = ['USA','INDIA','EUROPE']
626        template = '{{ "name":{0}  , "department": "{1}" , "join_yr":{2},'
627        template += ' "email":"{3}", "hobbies": {{ "hobby" : {4} }},'
628        template += ' "tasks":  {5},  '
629        template += '"VMs": {6} , '
630        template += '"address" : {7} }}'
631        count = 1
632
633        for dept in department:
634            for month in join_mo:
635                for day in join_day:
636                    random.seed(count)
637                    count += 1
638                    prefix = "employee" + str(random.random() * 100000)
639                    # array of  single objects
640                    name = [{"FirstName": "employeefirstname-%s" % (str(day))},
641                            {"MiddleName": "employeemiddlename-%s" % (str(day))},
642                            {"LastName": "employeelastname-%s" % (str(day))}]
643
644                    # array inside array inside object
645                    sportValue = random.sample(sport, 3)
646                    danceValue = random.sample(dance, 3)
647                    hobbies = [{"sports": sportValue}, {"dance": danceValue},"art"]
648                    email = ["%s-mail@couchbase.com" % (str(day))]
649                    joining = random.sample(join_yr,3)
650                    # array inside array
651                    enggValue = random.sample(engineer, 2)
652                    marketingValue = [{"region1" :random.choice(marketing),"region2" :random.choice(marketing)},{"region2" :random.choice(marketing)}]
653                    taskValue = [{"Developer": enggValue,"Marketing": marketingValue},"Sales","QA"]
654                    # array of multiple objects
655                    vms = [{"RAM": month, "os": "ubuntu",
656                            "name": "vm_%s" % month, "memory": month},
657                           {"RAM": month, "os": "windows",
658                            "name": "vm_%s" % (month + 1), "memory": month},
659                           {"RAM": month, "os": "centos", "name": "vm_%s" % (month + 2), "memory": month},
660                           {"RAM": month, "os": "macos", "name": "vm_%s" % (month + 3), "memory": month}
661                           ]
662
663                    addressvalue = [[ {"city": random.choice(cities)},{"street":random.choice(streets)}],[{"apartment":123,"country":random.choice(countries)}]]
664
665                    generators.append(DocumentGenerator("query-test" + prefix,
666                                                        template,
667                                                        [name], [dept], [[ y for y in joining]],
668                                                        email, [hobbies],
669                                                        [taskValue],
670                                                        [vms],[addressvalue],
671                                                        start=start, end=docs_per_day))
672
673        return generators
674
675    def generate_docs_sabre(self, docs_per_day=1, start=0, isShuffle=False, years=2, indexes=[1,4,8]):
676        generators = []
677        all_airports = ["ABR", "ABI", "ATL","BOS", "BUR", "CHI", "MDW", "DAL", "SFO", "SAN", "SJC", "LGA", "JFK", "MSP",
678                        "MSQ", "MIA", "LON", "DUB"]
679        dests = [all_airports[i] for i in indexes]
680        join_yr = self._shuffle(xrange(2010, 2010 + years), isShuffle)
681        join_mo = self._shuffle(xrange(1, 12 + 1),isShuffle)
682        join_day = self._shuffle(xrange(1, 28 + 1),isShuffle)
683        template = '{{ "Amount":{0}, "CurrencyCode":"{1}",'
684        template += ' "TotalTax":{{"DecimalPlaces" : {2}, "Amount" : {3}, "CurrencyCode" : "{4}"}},'
685        template += ' "Tax":{5}, "FareBasisCode":{6}, "PassengerTypeQuantity":{7}, "TicketType":"{8}",'
686        template += '"SequenceNumber": {9},'
687        template += ' "DirectionInd" : "{10}",  "Itinerary" : {11}, "Destination" : "{12}",'
688        template += '"join_yr":{13}, "join_mo":{14}, "join_day":{15}, "Codes":{16}}}'
689        count = 1
690        for dest in dests:
691            for year in join_yr:
692                for month in join_mo:
693                    for day in join_day:
694                        random.seed(count)
695                        count +=1
696                        prefix = '%s_%s-%s-%s' % (dest, year, month, day)
697                        amount = [float("%s.%s" % (month, month))]
698                        currency = [("USD", "EUR")[month in [1,3,5]]]
699                        decimal_tax = [1,2]
700                        amount_tax = [day]
701                        currency_tax = currency
702                        taxes = [{"DecimalPlaces": 2, "Amount": float(amount_tax[0])/3,
703                                  "TaxCode": "US1", "CurrencyCode": currency},
704                                 {"DecimalPlaces": 2, "Amount": float(amount_tax[0])/4,
705                                  "TaxCode": "US2", "CurrencyCode": currency},
706                                 {"DecimalPlaces": 2, "Amount": amount_tax[0] - float(amount_tax[0])/4-\
707                                  float(amount_tax[0])/3,
708                                  "TaxCode": "US2", "CurrencyCode": currency}]
709
710                        fare_basis = [{"content": "XA21A0NY", "DepartureAirportCode": dest,
711                                       "BookingCode": "X", "ArrivalAirportCode": "MSP"},
712                                      {"content": "XA21A0NY", "DepartureAirportCode": "MSP",
713                                       "AvailabilityBreak": True, "BookingCode": "X",
714                                       "ArrivalAirportCode": "BOS"}]
715                        pass_amount = [day]
716                        ticket_type = [("eTicket", "testType")[month in [1,3,5]]]
717                        sequence = [year]
718                        direction = [("oneWay", "return")[month in [2,6,10]]]
719                        itinerary = {"OriginDestinationOptions":
720                                     {"OriginDestinationOption": [
721                                       {"FlightSegment": [
722                                         {"TPA_Extensions":
723                                           {"eTicket": {"Ind": True}},
724                                           "MarketingAirline": {"Code": dest},
725                                           "StopQuantity": month,
726                                           "DepartureTimeZone": {"GMTOffset": -7},
727                                           "OperatingAirline": {"Code": "DL",
728                                                                "FlightNumber": year + month},
729                                           "DepartureAirport": {"LocationCode": "SFO"},
730                                           "ArrivalTimeZone": {"GMTOffset": -5},
731                                           "ResBookDesigCode": "X",
732                                           "FlightNumber": year + day,
733                                           "ArrivalDateTime": "2014-07-12T06:07:00",
734                                           "ElapsedTime": 212,
735                                           "Equipment": {"AirEquipType": 763},
736                                           "DepartureDateTime": "2014-07-12T00:35:00",
737                                           "MarriageGrp": "O",
738                                           "ArrivalAirport": {"LocationCode": random.sample(all_airports, 1)}},
739                                        {"TPA_Extensions":
740                                           {"eTicket": {"Ind": False}},
741                                           "MarketingAirline": {"Code": dest},
742                                           "StopQuantity": month,
743                                           "DepartureTimeZone": {"GMTOffset": -7},
744                                           "OperatingAirline": {"Code": "DL",
745                                                                "FlightNumber": year + month + 1},
746                                           "DepartureAirport": {"LocationCode": random.sample(all_airports, 1)},
747                                           "ArrivalTimeZone": {"GMTOffset": -3},
748                                           "ResBookDesigCode": "X",
749                                           "FlightNumber": year + day,
750                                           "ArrivalDateTime": "2014-07-12T06:07:00",
751                                           "ElapsedTime": 212,
752                                           "Equipment": {"AirEquipType": 764},
753                                           "DepartureDateTime": "2014-07-12T00:35:00",
754                                           "MarriageGrp": "1",
755                                           "ArrivalAirport": {"LocationCode": random.sample(all_airports, 1)}}],
756                                    "ElapsedTime": 619},
757                                   {"FlightSegment": [
758                                         {"TPA_Extensions":
759                                           {"eTicket": {"Ind": True}},
760                                           "MarketingAirline": {"Code": dest},
761                                           "StopQuantity": month,
762                                           "DepartureTimeZone": {"GMTOffset": -7},
763                                           "OperatingAirline": {"Code": "DL",
764                                                                "FlightNumber": year + month},
765                                           "DepartureAirport": {"LocationCode": random.sample(all_airports, 1)},
766                                           "ArrivalTimeZone": {"GMTOffset": -5},
767                                           "ResBookDesigCode": "X",
768                                           "FlightNumber": year + day,
769                                           "ArrivalDateTime": "2014-07-12T06:07:00",
770                                           "ElapsedTime": 212,
771                                           "Equipment": {"AirEquipType": 763},
772                                           "DepartureDateTime": "2014-07-12T00:35:00",
773                                           "MarriageGrp": "O",
774                                           "ArrivalAirport": {"LocationCode": random.sample(all_airports, 1)}},
775                                        {"TPA_Extensions":
776                                           {"eTicket": {"Ind": False}},
777                                           "MarketingAirline": {"Code": dest},
778                                           "StopQuantity": month,
779                                           "DepartureTimeZone": {"GMTOffset": -7},
780                                           "OperatingAirline": {"Code": "DL",
781                                                                "FlightNumber": year + month + 1},
782                                           "DepartureAirport": {"LocationCode": random.sample(all_airports, 1)},
783                                           "ArrivalTimeZone": {"GMTOffset": -3},
784                                           "ResBookDesigCode": "X",
785                                           "FlightNumber": year + day,
786                                           "ArrivalDateTime": "2014-07-12T06:07:00",
787                                           "ElapsedTime": 212,
788                                           "Equipment": {"AirEquipType": 764},
789                                           "DepartureDateTime": "2014-07-12T00:35:00",
790                                           "MarriageGrp": "1",
791                                           "ArrivalAirport": {"LocationCode": random.sample(all_airports, 1)}}]}]},
792                                     "DirectionInd": "Return"}
793                        generators.append(DocumentGenerator(prefix, template,
794                                               amount, currency, decimal_tax, amount_tax, currency_tax,
795                                               [taxes], [fare_basis], pass_amount, ticket_type, sequence,
796                                               direction, [itinerary], [dest], [year], [month], [day],
797                                               [[dest, dest]], start=start, end=docs_per_day))
798        return generators
799
800    def generate_docs_sales(self, key_prefix = "sales_dataset", test_data_type = True, start=0, docs_per_day=None, isShuffle = False):
801        generators = []
802        if end is None:
803            end = self.docs_per_day
804        join_yr = self._shuffle(range(2008, 2008 + self.years),isShuffle)
805        join_mo = self._shuffle(range(1, self.months + 1),isShuffle)
806        join_day = self._shuffle(range(1, self.days + 1),isShuffle)
807        count = 1
808        if test_data_type:
809            template = '{{ "join_yr" : {0}, "join_mo" : {1}, "join_day" : {2},'
810            template += ' "sales" : {3}, "delivery_date" : "{4}", "is_support_included" : {5},'
811            template += ' "is_high_priority_client" : {6}, "client_contact" :  "{7}",'
812            template += ' "client_name" : "{8}", "client_reclaims_rate" : {9}}}'
813            sales = self._shuffle([200000, 400000, 600000, 800000],isShuffle)
814
815            is_support = self._shuffle(['true', 'false'],isShuffle)
816            is_priority = self._shuffle(['true', 'false'],isShuffle)
817            contact = "contact_"+str(random.random()*10000000)
818            name ="name_"+str(random.random()*100000)
819            rate = [x * 0.1 for x in range(0, 10)]
820            for year in join_yr:
821                for month in join_mo:
822                    for day in join_day:
823                        random.seed(count)
824                        count +=1
825                        prefix = "prefix_"+str(random.random()*100000)
826                        delivery = str(datetime.date(year, month, day))
827                        generators.append(DocumentGenerator(key_prefix + prefix,
828                                                  template,
829                                                  [year], [month], [day],
830                                                  sales, [delivery], is_support,
831                                                  is_priority, [contact],
832                                                  [name], rate,
833                                                  start=start, end=end))
834        return generators
835
836    def generate_docs_bigdata(self, key_prefix = "big_dataset", value_size = 1024, start=0, docs_per_day=1, end=None):
837        if end is None:
838            end = docs_per_day
839        age = range(start, end)
840        name = ['a' * value_size,]
841        template = '{{ "age": {0}, "name": "{1}" }}'
842
843        gen_load = DocumentGenerator(key_prefix, template, age, name, start=start,
844                                     end=end)
845        return [gen_load]
846
847    def generate_docs_simple(self, key_prefix ="simple_dataset", start=0, docs_per_day = 1000, isShuffle = False):
848        end = docs_per_day
849        age = self._shuffle(range(start, end), isShuffle)
850        name = [key_prefix + '-' + str(i) for i in self._shuffle(xrange(start, end), isShuffle)]
851        template = '{{ "age": {0}, "name": "{1}" }}'
852        gen_load = DocumentGenerator(key_prefix, template, age, name, start=start, end=end)
853        return [gen_load]
854
855    def generate_docs_array(self, key_prefix="array_dataset", start=0, docs_per_day=1, isShuffle=False):
856        COUNTRIES = ["India", "US", "UK", "Japan", "France", "Germany", "China", "Korea", "Canada", "Cuba",
857             "West Indies", "Australia", "New Zealand", "Nepal", "Sri Lanka", "Pakistan", "Mexico",
858             "belgium", "Netherlands", "Brazil", "Costa Rica", "Cambodia", "Fiji", "Finland", "haiti",
859             "Hong Kong", "Iceland", "Iran", "Iraq", "Italy", "Greece", "Jamaica", "Kenya", "Kuwait", "Macau",
860             "Spain","Morocco", "Maldives", "Norway"]
861
862        COUNTRY_CODE = ["Ind123", "US123", "UK123", "Jap123", "Fra123", "Ger123", "Chi123", "Kor123", "Can123",
863                "Cub123", "Wes123", "Aus123", "New123", "Nep123", "Sri123", "Pak123", "Mex123", "bel123",
864                "Net123", "Bra123", "Cos123", "Cam123", "Fij123", "Fin123", "hai123", "Hon123", "Ice123",
865                "Ira123", "Ira123", "Ita123", "Gre123", "Jam123", "Ken123", "Kuw123", "Mac123", "Spa123",
866                "Mor123", "Mal123", "Nor123"]
867        end = docs_per_day
868        generators = []
869        template = '{{"name": "{0}", "email": "{1}", \
870                   "countries": {2}, "code": {3}}}'
871        for i in range(start, end):
872            countries = []
873            codes = []
874            name = ["Passenger-{0}".format(i)]
875            email = ["passenger_{0}@abc.com".format(i)]
876            start_pnt = random.randint(0, len(COUNTRIES)-2)
877            end_pnt = random.randint(start_pnt, len(COUNTRIES)-1)
878            cnt = COUNTRIES[start_pnt:end_pnt]
879            countries.append(cnt)
880            cde = COUNTRY_CODE[start_pnt:end_pnt]
881            codes.append(cde)
882            prefix = "{0}-{1}".format(key_prefix,i)
883            generators.append(DocumentGenerator(prefix, template,
884                                                name, email, countries, codes,  start=start, end=end))
885        return generators
886
887    def generate_all_type_documents_for_gsi(self, start=0, docs_per_day=10):
888        """
889        Document fields:
890        name: String
891        age: Number
892        email: Alphanumeric + Special Character
893        premium_customer: Boolean or <NULL>
894        Address: Object
895                {Line 1: Alphanumeric + Special Character
896                Line 2: Alphanumeric + Special Character or <NULL>
897                City: String
898                Country: String
899                postal_code: Number
900                }
901        travel_history: Array of string - Duplicate elements ["India", "US", "UK", "India"]
902        travel_history_code: Array of alphanumerics - Duplicate elements
903        booking_history: Array of objects
904                        {source:
905                         destination:
906                          }
907        credit_cards: Array of numbers
908        secret_combination: Array of mixed data types
909        countries_visited: Array of strings - non-duplicate elements
910
911        :param start:
912        :param docs_per_day:
913        :param isShuffle:
914        :return:
915        """
916        generators = []
917        bool_vals = [True, False]
918        template = r'{{ "name":"{0}", "email":"{1}", "age":{2}, "premium_customer":{3}, ' \
919                   '"address":{4}, "travel_history":{5}, "travel_history_code":{6}, "travel_details":{7},' \
920                   '"booking":{8}, "credit_cards":{9}, "secret_combination":{10}, "countries_visited":{11}, ' \
921                   '"question_values":{12}}}'
922        for i in range(docs_per_day):
923            name = random.choice(FIRST_NAMES)
924            age = random.randint(25, 70)
925            last_name = random.choice(LAST_NAMES)
926            dob = "{0}-{1}-{2}".format(random.randint(1970, 1999),
927                                       random.randint(1, 28), random.randint(1, 12))
928            email = "{0}.{1}.{2}@abc.com".format(name, last_name, dob.split("-")[1])
929            premium_customer = random.choice(bool_vals)
930            address = {}
931            address["line_1"] = "Street No. {0}".format(random.randint(100, 200))
932            address["line_2"] = "null"
933            if not random.choice(bool_vals):
934                address["address2"] = "Building {0}".format(random.randint(1, 6))
935            address["city"] = "Bangalore"
936            address["contact"] = "{0} {1}".format(name, last_name)
937            address["country"] = "India"
938            address["postal_code"] = "{0}".format(random.randint(560071, 560090))
939            credit_cards = [random.randint(-1000000, 9999999) for i in range(random.randint(3, 7))]
940            secret_combo = [''.join(random.choice(string.lowercase) for i in range(7)),
941                            random.randint(1000000, 9999999)]
942            travel_history = [random.choice(COUNTRIES[:9]) for i in range(1, 11)]
943            travel_history_code = [COUNTRY_CODE[COUNTRIES.index(i)] for i in travel_history]
944            travel_details = [{"country": travel_history[i], "code": travel_history_code[i]}
945                              for i in range(len(travel_history))]
946            countries_visited = list(set(travel_history))
947            booking = {"source": random.choice(COUNTRIES), "destination": random.choice(COUNTRIES)}
948            confirm_question_values = [random.choice(bool_vals) for i in range(5)]
949            prefix = "airline_record_" + str(random.random()*100000)
950            generators.append(DocumentGenerator(prefix, template, [name], [email], [age], [premium_customer],
951                                                [address], [travel_history], [travel_history_code], [travel_details],
952                                                [booking], [credit_cards], [secret_combo], [countries_visited],
953                                                [confirm_question_values], start=start, end=1))
954        return generators
955
956    def generate_doc_for_aggregate_pushdown(self, start=0, docs_per_day=10):
957        generators = []
958        bool_vals = [True, False]
959        template = r'{{ "name":"{0}", "business_name": "{1}", "age":{2}, "weight": {3}, "debt":{4}, "passwords":{5}, ' \
960                   r'"transactions":{6}, "address":{7}, "travel_history":{8}, "credit_cards":{9}}}'
961        for i in range(docs_per_day):
962            name = random.choice(FIRST_NAMES)
963            last_name = random.choice(LAST_NAMES)
964            business_name = name + "_" + last_name
965            age = random.randint(25, 70)
966            weight = round(random.uniform(50.1, 115.5), 2)
967            debt = random.randint(-999999, -100000)
968            passwords = []
969            for i in range(random.randint(10, 50)):
970                passwords.append(random.randint(-10000, 99999))
971                passwords.append(''.join(random.choice(string.ascii_lowercase) for _ in range(9)))
972            transactions = [random.randint(1000, 9999999) for _ in range(random.randint(10, 50))]
973            address = {}
974            address["country"] = random.choice(COUNTRIES)
975            address["postal_code"] = random.randint(200000, 800000)
976            travel_history = [random.choice(COUNTRIES[:9]) for i in range(1, 11)]
977            credit_cards = [random.randint(-1000000, 9999999) for i in range(random.randint(3, 7))]
978            prefix = "bank_record_" + str(random.random()*100000)
979            generators.append(DocumentGenerator(prefix, template, [name], [business_name], [age], [weight], [debt],
980                                                [passwords], [transactions], [address], [travel_history],
981                                                [credit_cards], start=start, end=1))
982        return generators
983
984    def generate_docs_employee_data(self, key_prefix ="employee_dataset", start=0, docs_per_day = 1, isShuffle = False):
985        generators = []
986        count = 1
987        sys_admin_info = {"title" : "System Administrator and heliport manager",
988                              "desc" : "...Last but not least, as the heliport manager, you will help maintain our growing fleet of remote controlled helicopters, that crash often due to inexperienced pilots.  As an independent thinker, you may be free to replace some of the technologies we currently use with ones you feel are better. If so, you should be prepared to discuss and debate the pros and cons of suggested technologies with other stakeholders",
989                              "type" : "admin"}
990        ui_eng_info = {"title" : "UI Engineer",
991                           "desc" : "Couchbase server UI is one of the crown jewels of our product, which makes the Couchbase NoSQL database easy to use and operate, reports statistics on real time across large clusters, and much more. As a Member of Couchbase Technical Staff, you will design and implement front-end software for cutting-edge distributed, scale-out data infrastructure software systems, which is a pillar for the growing cloud infrastructure.",
992                            "type" : "ui"}
993        senior_arch_info = {"title" : "Senior Architect",
994                               "desc" : "As a Member of Technical Staff, Senior Architect, you will design and implement cutting-edge distributed, scale-out data infrastructure software systems, which is a pillar for the growing cloud infrastructure. More specifically, you will bring Unix systems and server tech kung-fu to the team.",
995                               "type" : "arch"}
996        data_sets = self._shuffle([sys_admin_info, ui_eng_info, senior_arch_info],isShuffle)
997        if end is None:
998            end = self.docs_per_day
999        join_yr = self._shuffle(range(2008, 2008 + self.years),isShuffle)
1000        join_mo = self._shuffle(range(1, self.months + 1),isShuffle)
1001        join_day = self._shuffle(range(1, self.days + 1),isShuffle)
1002        name = ["employee-%s-%s" % (key_prefix, str(i)) for i in xrange(start, end)]
1003        email = ["%s-mail@couchbase.com" % str(i) for i in xrange(start, end)]
1004        template = '{{ "name":"{0}", "join_yr":{1}, "join_mo":{2}, "join_day":{3},'
1005        template += ' "email":"{4}", "job_title":"{5}", "type":"{6}", "desc":"{7}"}}'
1006        for info in data_sets:
1007            for year in join_yr:
1008                for month in join_mo:
1009                    for day in join_day:
1010                        random.seed(count)
1011                        prefix = str(random.random()*100000)
1012                        generators.append(DocumentGenerator(key_prefix + prefix,
1013                                               template,
1014                                               name, [year], [month], [day],
1015                                               email, [info["title"]],
1016                                               [info["type"]], [info["desc"]],
1017                                               start=start, end=docs_per_day))
1018        return generators
1019
1020    def generate_docs_using_monster(self,
1021            executatble_path = None, key_prefix=  "", bag_dir = "lib/couchbase_helper/monster/bags",
1022            pod_name = None, num_items = 1, seed = None):
1023        "This method runs monster tool using localhost, creates a map of json based on a pattern"
1024        list = []
1025        command = executatble_path
1026        dest_path = "/tmp/{0}.txt".format(int(random.random()*1000))
1027        if pod_name == None:
1028            return list
1029        else:
1030            pod_path = "lib/couchbase_helper/monster/prod/%s" % pod_name
1031        command += " -bagdir {0}".format(bag_dir)
1032        if seed != None:
1033            command += " -s {0}".format(seed)
1034        command += " -n {0}".format(num_items)
1035        command += " -o {0}".format(dest_path)
1036        if pod_path != None:
1037            command += " {0}".format(pod_path)
1038        print "Will run the following command: {0}".format(command)
1039        # run command and generate temp file
1040        os.system(command)
1041        # read file and generate list
1042        with open(dest_path) as f:
1043            i= 1
1044            for line in f.readlines():
1045                key = "{0}{1}".format(key_prefix,i)
1046                data = json.loads(line[:len(line)-1])
1047                data["_id"] = key
1048                data["mutate"] = 0
1049                list.append(data)
1050                i+=1
1051        os.remove(dest_path)
1052        return list
1053
1054    def _shuffle(self, data, isShuffle):
1055        if isShuffle:
1056            if not isinstance(data, list):
1057                data = [x for x in data]
1058            random.shuffle(data)
1059            return data
1060        return data
1061