summaryrefslogtreecommitdiffstats
path: root/stats/stats-bugzilla.py
blob: 1bb65d90aaf76a572167e0e6043de40c1a6c7f3d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Query Bugzilla and retrieve statistics about user contribution (number of
reports).

@author Kamil Páral <kparal@redhat.com>, 2013
@licence GNU AGPL 3+ <http://www.gnu.org/licenses/agpl-3.0.html>
"""

import bugzilla
import logging
import optparse
import sys
import codecs
import locale
import cgi
import re

# Module-level state. Everything below starts out empty and is filled in at
# run time by parse_args(), init(), main(), query_bz() and query_users().
RELEASE = MILESTONE = FROM = TO = None # positional command line arguments
blocker = None # the blocker bug object returned by get_blocker()
blocker_alias = None # we have to store it separately, because blocker.alias
                     # might also contain other aliases and it's hard to parse
options = None # parsed command line options (see parse_args())
bugs = None # result of the Bugzilla query made in query_bz()
results = {} # maps a reporter login (bug.creator) to its UserStats
skipped_reporters = 0 # number of reporters below the threshold
skipped_reports = 0 # number of uncounted bugs because of skipped reporters

bz_query_base = None # the core part of bugzilla query URL
bz_query_all = None # bugzilla query URL listing all bugs (without blacklisted
                    # reporters)

# reporters whose bugs are excluded from the query built in init()
blacklisted_reporters = ['abrt-bot@fedoraproject.org',
                         'upstream-release-monitoring@fedoraproject.org']

# init basic variables
# Bugzilla client shared by all the query functions in this file
bz = bugzilla.RHBugzilla4(url='https://bugzilla.redhat.com/xmlrpc.cgi')

# this module's logger is set to DEBUG, so the progress messages logged with
# logger.debug() are actually emitted
logging.basicConfig(format='%(levelname)s:%(message)s')
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Wrap sys.stdout into a StreamWriter to allow writing unicode.
# http://stackoverflow.com/questions/4545661/unicodedecodeerror-when-redirecting-to-file
sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout)


class UserStats(object):
    '''Per-reporter tally.

    All attributes are public and are meant to be updated directly by the
    code that gathers the statistics.
    '''

    def __init__(self):
        # name to show for this reporter; not known at creation time
        self.display_name = None
        # the three counters all start from zero
        self.reports = self.excess = self.accepted_blockers = 0


def parse_args():
    '''Parse the command line and publish the outcome in module globals.

    After a successful call:
      - ``options`` holds the parsed optparse options (``options.threshold``)
      - ``RELEASE``, ``MILESTONE``, ``FROM`` and ``TO`` hold the four
        positional arguments, in that order (as strings, not validated)

    If the number of positional arguments is not exactly four, the problem is
    reported through ``parser.error()``, which terminates the program.
    '''
    usage = 'usage: %prog [options] RELEASE MILESTONE FROM TO'
    description = '''\
Query Bugzilla and retrieve statistics about user contribution (number of \
reports). Generate the output as an HTML file (ugly code, but suitable for \
blog embedding) printed to standard output.

Arguments:
  RELEASE - the Fedora release number to query for, e.g. "18"
  MILESTONE - milestone of the relevant blocker bug: "Alpha/Beta/Final"
  FROM - start date from when to gather statistics (in YYYY-MM-DD format)
  TO - end date to which to gather statistics (in YYYY-MM-DD format)
'''

    class _VerbatimDescriptionParser(optparse.OptionParser):
        '''OptionParser that prints its description exactly as written.'''

        def format_description(self, formatter):
            # don't strip newlines from description; done in a local subclass
            # so that optparse.OptionParser itself is left untouched for any
            # other user of the module
            return self.description

    parser = _VerbatimDescriptionParser(usage=usage, description=description)
    parser.add_option('-t', '--threshold', type='int', default=1,
        help=("Only reporters with at least THRESHOLD reports will be included "
              "in the statistics [default: %default]"))

    global options, RELEASE, MILESTONE, FROM, TO
    options, args = parser.parse_args()

    # sanity check
    if len(args) != 4:
        parser.error('Invalid number of arguments. See --help.')

    RELEASE, MILESTONE, FROM, TO = args


def init():
    '''Build the Bugzilla query URLs. To be called after
    @method parse_args(), because RELEASE, FROM and TO are read here.

    Fills in the bz_query_base global (bugs created for the given version
    within the given dates) and the bz_query_all global (the same query with
    every address from blacklisted_reporters excluded as a reporter).'''
    global bz_query_base, bz_query_all

    base_template = (
        'https://bugzilla.redhat.com/buglist.cgi?classification=Fedora&'
        'product=Fedora&version=%(version)s&query_format=advanced&limit=0&'
        'chfield=[Bug creation]&chfieldfrom=%(from)s&chfieldto=%(to)s')
    bz_query_base = base_template % {
        'version': RELEASE, 'from': FROM, 'to': TO}

    # one numbered "reporter is not equal to <email>" clause per address,
    # numbering starts at 1
    exclusion_template = ('&email%(num)s=%(email)s&emailreporter%(num)s=1&'
                          'emailtype%(num)s=notequals')
    exclusions = [exclusion_template % {'num': position, 'email': address}
                  for position, address
                  in enumerate(blacklisted_reporters, 1)]

    bz_query_all = bz_query_base + ''.join(exclusions)


def get_blocker():
    '''Get relevant blocker bug and return it as a BZ object.

    The alias to search for is built from the RELEASE and MILESTONE globals
    as ``F<RELEASE><Milestone>Blocker`` (milestone capitalized) and is also
    stored in the blocker_alias global.

    @return the one bug Bugzilla returned for that alias

    If the query does not return exactly one bug, an error is logged and the
    program exits with status 1.
    '''

    global blocker_alias
    blocker_alias = 'F%s%sBlocker' % (RELEASE, MILESTONE.capitalize())

    logger.debug('Searching for bug with alias %s...', blocker_alias)

    blockers = bz.query({'alias': blocker_alias})
    # explicit check instead of an assert: asserts are skipped under
    # "python -O" and a bare AssertionError does not tell the user that the
    # release/milestone combination is the likely culprit
    if len(blockers) != 1:
        logger.error('Expected exactly one bug with alias %s, but found %d! '
                     'Aborting.', blocker_alias, len(blockers))
        sys.exit(1)

    found = blockers[0]
    logger.debug('Done, %s ID is %s', blocker_alias, found.id)

    return found


def query_bz():
    '''Query bugzilla for bugs reported in a certain period and fill the
    results variable.

    Steps:
      1. run the query described by the bz_query_all URL and store the
         returned bugs in the bugs global
      2. for every bug, update the UserStats of its creator in results:
         number of reports, number of "excess" reports (CLOSED with one of
         the resolutions listed below) and number of accepted blockers (the
         bug blocks the blocker bug and has AcceptedBlocker in its
         whiteboard)
      3. drop reporters with fewer than options.threshold reports from
         results, counting them in the skipped_reporters and skipped_reports
         globals

    If no reporter is left afterwards, an error is logged and the program
    exits with status 1.
    '''
    global bugs, skipped_reporters, skipped_reports

    # build query
    query = bz.url_to_query(bz_query_all)
    query['include_fields'] = ['id','creator','status','resolution',
                               'whiteboard', 'blocks']

    # query
    logger.debug('Searching for bugs in the requested time period, this will '
                 'take a while...')
    bugs = bz.query(query)

    logger.debug('Done, found %s matching bugs', len(bugs))

    # populate results
    excess_resolutions = ('NOTABUG', 'WONTFIX', 'WORKSFORME', 'CANTFIX',
                          'INSUFFICIENT_DATA')
    for bug in bugs:
        creator = bug.creator
        if creator not in results:
            results[creator] = UserStats()
        stats = results[creator]

        stats.reports += 1

        if bug.status == 'CLOSED' and bug.resolution in excess_resolutions:
            stats.excess += 1

        if blocker.id in bug.blocks and 'AcceptedBlocker' in bug.whiteboard:
            stats.accepted_blockers += 1

    logger.debug('There are %d reporters in total', len(results))

    # remove reporters below the threshold; loop over a snapshot of the keys,
    # because entries are deleted from results inside the loop
    for creator in list(results):
        report_count = results[creator].reports
        if report_count < options.threshold:
            skipped_reporters += 1
            skipped_reports += report_count
            del results[creator]
    logger.debug('Removed %d reporters (%d reports), because they were below '
        'the threshold. %d reporters (%d reports) were kept in the statistics.',
        skipped_reporters, skipped_reports, len(results),
        len(bugs) - skipped_reports)

    if not results:
        logger.error('No reporters matching criteria found! Aborting.')
        sys.exit(1)


def query_users():
    '''
    Look up a displayable name for every login stored in results.

    The display_name of each entry becomes the account's real name, or the
    account's email when the real name is empty. In both cases "@" is
    spelled out as " at " to protect email addresses against spam.
    '''

    logger.debug('Getting real user names for retrieved Bugzilla accounts...')

    logins = results.keys()
    accounts = bz.getusers(logins)
    assert len(logins) == len(accounts)

    # accounts are matched by email, not by position: the returned list is
    # not ordered the same as the list of logins that was sent
    # https://bugzilla.redhat.com/show_bug.cgi?id=906781
    for account in accounts:
        assert account.email in results, \
          'this email is not in the login list: %s' % account.email

        shown = account.real_name or account.email
        # make sure we hold a unicode object, names can be non-ASCII
        if not isinstance(shown, unicode):
            shown = unicode(shown, 'UTF-8')
        # a real name can be an email address just like a login can, so the
        # mangling is applied regardless of where the name came from
        results[account.email].display_name = shown.replace('@', ' at ')

    logger.debug('Done')


def print_results():
    '''Print results in an HTML format to standard output.

    Reads the results, bugs, options, skipped_reporters, skipped_reports,
    bz_query_base, bz_query_all, blocker_alias, RELEASE, FROM and TO globals,
    so it has to be called after they have all been filled in.

    Reporters are listed by number of reports (highest first), then by number
    of accepted blockers (highest first), then alphabetically by display name
    (case-insensitive). The row mentioning reporters below the threshold is
    left out when the threshold is 1 or lower.

    Display names and all URLs are HTML-escaped before being placed into the
    page. After printing, the display names of reporters whose login does not
    end with @redhat.com are written to the debug log.
    '''

    template = '''\
<!DOCTYPE HTML>
<html>
<head>
  <meta charset="UTF-8">
  <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
</head>
<body>
  <p>Test period: <b>Fedora %(release)s [FIXME] - Fedora %(release)s [FIXME]</b> \
                  (%(from)s - %(to)s)<br/> \
     Reporters: <b>%(all_reporters)d</b><br/> \
     New reports: <b>%(all_reports)s</b> \
  </p>
  <table>
    <tr style="text-align:center">
      <th style="vertical-align:middle">Name</th>
      <th style="vertical-align:middle">Reports submitted<a style="text-decoration:none" \
          href="#reports"><sup>1</sup></a></th>
      <th style="vertical-align:middle">Excess reports<a style="text-decoration:none" \
          href="#excess"><sup>2</sup></a></th>
      <th style="vertical-align:middle">Accepted blockers<a style="text-decoration:none" \
          href="#accepted"><sup>3</sup></a></th>
    </tr>
    %(lines)s
    <tr id="threshold">
      <td colspan="4"><i>...and also %(skipped_reporters)d other reporters who \
      created less than %(threshold)d reports each, but %(skipped_reports)d \
      reports combined!</i></td>
    </tr>
  </table>
  <p style="color:#808080;font-size:smaller"> \
  <sup id="reports">1</sup> The total number of <i>new</i> reports (including \
  "excess reports"). Reopened reports or reports with a changed version are not \
  included, because it was not technically easy to retrieve those. This is one \
  of the reasons why you shouldn't take the numbers too seriously, but just as \
  interesting and fun data.<br/> \
  <sup id="excess">2</sup> Excess reports are those that were closed as \
  NOTABUG, WONTFIX, WORKSFORME, CANTFIX or INSUFFICIENT_DATA. \
  Excess reports are not necessarily a bad thing, but they make for interesting \
  statistics. Close manual inspection is required to separate valuable excess \
  reports from those which are less valuable.<br/> \
  <sup id="accepted">3</sup> This only includes reports that were created by \
  that particular user and accepted as blockers afterwards. The user might have \
  proposed other people's reports as blockers, but this is not reflected in \
  this number. \
  </p>
</body>
</html>
'''

    # Only the suffix is %-formatted, never bz_query_base itself, so a "%"
    # that ends up in the base URL cannot break the formatting.
    person_suffix = '&emailreporter1=1&emailtype1=exact&email1=%s'
    excess_url_suffix = '&resolution=NOTABUG&resolution=WONTFIX&' \
      'resolution=WORKSFORME&resolution=CANTFIX&resolution=INSUFFICIENT_DATA'
    accepted_url_suffix = '&f1=blocked&o1=substring&v1=%s' \
      '&f2=status_whiteboard&o2=substring&v2=AcceptedBlocker' % blocker_alias

    # sort according to score: reports and accepted blockers are negated
    # because we want 9->1 for numbers, but a->z for names
    def rank(login):
        stats = results[login]
        return (-stats.reports, -stats.accepted_blockers,
                stats.display_name.lower())

    ladder = sorted(results, key=rank)

    # format results
    lines = []
    for login in ladder:
        stats = results[login]
        # URLs are assembled in full first and escaped as a whole afterwards,
        # so every "&" in an href attribute ends up as "&amp;"
        reports_url = bz_query_base + (person_suffix % login)
        excess_url = reports_url + excess_url_suffix
        accepted_url = reports_url + accepted_url_suffix

        # name; it comes from Bugzilla account data, so it must not be
        # trusted to be free of HTML special characters
        cells = ['<td>%s</td>' % cgi.escape(stats.display_name)]

        # reports
        cells.append('<td align="center"><a href="%s">%d</a></td>' % (
            cgi.escape(reports_url, True), stats.reports))

        # excess, with the share of all reports of this reporter
        if stats.reports > 0:
            percentage = stats.excess * 100 // stats.reports
        else:
            percentage = 0
        percentage = ' <span style="font-size:smaller">(%d%%)</span>' % percentage
        cells.append('<td align="center"><a href="%s">%d</a>%s</td>' % (
            cgi.escape(excess_url, True), stats.excess, percentage))

        # accepted; non-zero numbers are highlighted
        number = '%d' % stats.accepted_blockers
        if stats.accepted_blockers > 0:
            number = '<span style="color:red">%s</span>' % number
        cells.append('<td align="center"><a href="%s">%s</a></td>' % (
            cgi.escape(accepted_url, True), number))

        lines.append('<tr>' + ''.join(cells) + '</tr>')

    # format the template
    output = template % {
        'lines': '\n'.join(lines),
        'skipped_reporters': skipped_reporters,
        'threshold': options.threshold,
        'skipped_reports': skipped_reports,
        'all_reporters': len(results) + skipped_reporters,
        'all_reports': '<a href="%s">%d</a>' % (
            cgi.escape(bz_query_all, True), len(bugs)),
        'from': FROM,
        'to': TO,
        'release': RELEASE,
    }

    # remove unnecessary lines: with a threshold of 1 or lower nobody was
    # skipped, so the "...and also N other reporters" row makes no sense
    if options.threshold <= 1:
        output = re.sub('<tr id="threshold">.*?</tr>', '', output, flags=re.DOTALL)

    # print
    logger.debug('Printing stats...')
    sys.stdout.write(output + '\n')

    # print users that might not be Red Hatters, that helps to direct thanks
    # to the community members
    outsiders = [results[login].display_name for login in ladder
                 if not login.endswith('@redhat.com')]
    logger.debug("List of contributors that *might* not be Red Hatters (don't "
        "have a @redhat.com email address): %s", ', '.join(outsiders))


def main():
    '''Run the whole job: read the command line, query Bugzilla and print
    the HTML report.'''
    global blocker

    # the query URLs built by init() are derived from the parsed arguments
    parse_args()
    init()

    # has to be known before the bugs are processed, query_bz() compares
    # against blocker.id
    blocker = get_blocker()

    # gather the numbers, resolve the names, print the page - in this order
    for step in (query_bz, query_users, print_results):
        step()

    logger.debug('Stats generation complete')


# Run main() only when executed as a script. An interrupt from the keyboard
# (Ctrl+C) ends the program with exit status 1 instead of a traceback.
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        sys.exit(1)