Example of working with the denormalized Mediawiki history dumps¶

The Mediawiki history dumps contain a wealth of information about all of the actions taken on wikis. They can be quite large as a result, but they allow for very detailed analyses of edit behavior, page creation, etc. on wikis without needing to join in lots of other datasets.

Below is an example of how to access this dataset on PAWS. While the data for most wikis is likely too large to process via PAWS, this is a good way to test out analyses on smaller wikis without having to download the data locally.
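As a quick orientation, the snapshots are organized as <snapshot month>/<wiki>, which is the directory layout the cells below rely on. A minimal sketch (assuming the same PAWS mount point used throughout this notebook) for checking which snapshots and wikis are available:

import os

DUMP_DIR = "/public/dumps/public/other/mediawiki_history"
# most recent monthly snapshots available on the shared dumps filesystem
print(sorted(os.listdir(DUMP_DIR))[-3:])
# a few of the per-wiki directories inside one snapshot
print(sorted(os.listdir(os.path.join(DUMP_DIR, "2022-01")))[:5])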

In [1]:
import bz2
import csv
import os
import time

import pandas as pd
In [2]:
DUMP_DIR = "/public/dumps/public/other/mediawiki_history"
DATE = "2022-01"
WIKI = "hrwiki"
# for schema and how to interpret fields, see: 
# https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Edits/Mediawiki_history_dumps#Technical_Documentation
DATA_SCHEMA = ["wiki_db",
               "event_entity", "event_type", "event_timestamp", "event_comment_escaped",
               "event_user_id", "event_user_text_historical_escaped", "event_user_text_escaped",
               "event_user_blocks_historical_string", "event_user_blocks_string",
               "event_user_groups_historical_string", "event_user_groups_string",
               "event_user_is_bot_by_historical_string", "event_user_is_bot_by_string",
               "event_user_is_created_by_self", "event_user_is_created_by_system", "event_user_is_created_by_peer",
               "event_user_is_anonymous",
               "event_user_registration_timestamp", "event_user_creation_timestamp",
               "event_user_first_edit_timestamp", "event_user_revision_count", "event_user_seconds_since_previous_revision",
               "page_id", "page_title_historical_escaped", "page_title_escaped",
               "page_namespace_historical", "page_namespace_is_content_historical", "page_namespace", "page_namespace_is_content",
               "page_is_redirect", "page_is_deleted",
               "page_creation_timestamp", "page_first_edit_timestamp", "page_revision_count", "page_seconds_since_previous_revision",
               "user_id", "user_text_historical_escaped", "user_text_escaped",
               "user_blocks_historical_string", "user_blocks_string", "user_groups_historical_string", "user_groups_string",
               "user_is_bot_by_historical_string", "user_is_bot_by_string",
               "user_is_created_by_self", "user_is_created_by_system", "user_is_created_by_peer", "user_is_anonymous",
               "user_registration_timestamp", "user_creation_timestamp", "user_first_edit_timestamp",
               "revision_id", "revision_parent_id", "revision_minor_edit", "revision_deleted_parts_string",
               "revision_deleted_parts_are_suppressed", "revision_text_bytes", "revision_text_bytes_diff",
               "revision_text_sha1", "revision_content_model", "revision_content_format",
               "revision_is_deleted_by_page_deletion", "revision_deleted_by_page_deletion_timestamp",
               "revision_is_identity_reverted", "revision_first_identity_reverting_revision_id",
               "revision_seconds_to_identity_revert", "revision_is_identity_revert", "revision_is_from_before_page_creation",
               "revision_tags_string"]
In [3]:
# nearly 500MB of compressed data, so parsing it below might take a while
!ls -shH {DUMP_DIR}/{DATE}/{WIKI}
total 484M
484M 2022-01.hrwiki.all-time.tsv.bz2

Simple analysis of who creates what pages¶

In [6]:
event_entity_idx = DATA_SCHEMA.index('event_entity')
event_type_idx = DATA_SCHEMA.index('event_type')
curr_bot_idx = DATA_SCHEMA.index('event_user_is_bot_by_string')
hist_bot_idx = DATA_SCHEMA.index('event_user_is_bot_by_historical_string')
ip_editor_idx = DATA_SCHEMA.index('event_user_is_anonymous')
page_ns_idx = DATA_SCHEMA.index('page_namespace')
page_id_idx = DATA_SCHEMA.index('page_id')
redirect_idx = DATA_SCHEMA.index('page_is_redirect')

wiki_dir = os.path.join(DUMP_DIR, DATE, WIKI)
history_fns = sorted([fn for fn in os.listdir(wiki_dir) if fn.endswith('.tsv.bz2')])
data = []
for fn in history_fns:
    print(fn)
    with bz2.open(os.path.join(wiki_dir, fn), 'rt') as fin:
        for line in fin:
            line = line.rstrip('\n').split('\t')  # strip only the newline so empty trailing fields are preserved
            # Filter to just page creations
            # See: https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Edits/Mediawiki_history_dumps#Important_fields:_event_entity_and_event_type
            if line[event_entity_idx] == 'page' and line[event_type_idx] == 'create':
                # determine user type
                user_type = 'user'
                if line[ip_editor_idx] == 'true':
                    user_type = 'ip_editor'
                elif line[curr_bot_idx] or line[hist_bot_idx]:
                    user_type = 'bot'
                # verify valid page info
                try:
                    ns = int(line[page_ns_idx])
                    is_redirect = line[redirect_idx] == 'true'
                    if ns == 0 and not is_redirect:
                        article_type = 'article'
                    elif is_redirect:
                        article_type = 'redirect'
                    elif ns % 2 == 1:
                        article_type = 'talk'
                    else:
                        article_type = 'non-article'
                    page_id = int(line[page_id_idx])
                    if page_id == 0:
                        continue
                except ValueError:
                    continue  # happens occasionally, but rarely enough that these rows can generally be skipped
                    
                data.append((article_type, user_type))
2022-01.hrwiki.all-time.tsv.bz2
In [7]:
df = pd.DataFrame(data, columns=['article_type', 'user_type'])
In [8]:
df.groupby('article_type')['user_type'].value_counts(dropna=False)
Out[8]:
article_type  user_type
article       user         213288
              ip_editor     27538
              bot           17090
non-article   user         117456
              bot            2151
              ip_editor      1986
redirect      user          70169
              bot            3126
              ip_editor      1720
talk          bot          139223
              user          57847
              ip_editor      5612
Name: user_type, dtype: int64
In [9]:
df.groupby('article_type')['user_type'].value_counts(dropna=False, normalize=True)
Out[9]:
article_type  user_type
article       user         0.826967
              ip_editor    0.106771
              bot          0.066262
non-article   user         0.965977
              bot          0.017690
              ip_editor    0.016333
redirect      user         0.935400
              bot          0.041672
              ip_editor    0.022929
talk          bot          0.686904
              user         0.285408
              ip_editor    0.027689
Name: user_type, dtype: float64
In [10]:
df.groupby('user_type')['article_type'].value_counts()
Out[10]:
user_type  article_type
bot        talk            139223
           article          17090
           redirect          3126
           non-article       2151
ip_editor  article          27538
           talk              5612
           non-article       1986
           redirect          1720
user       article         213288
           non-article     117456
           redirect         70169
           talk             57847
Name: article_type, dtype: int64
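The same breakdown is arguably easier to scan as a single table; a minimal sketch using pandas' crosstab on the df built above:

# rows are article types, columns are user types, values are the share of creations within each row
pd.crosstab(df['article_type'], df['user_type'], normalize='index')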

Do bot-created pages get edited?¶

In [11]:
event_entity_idx = DATA_SCHEMA.index('event_entity')
event_type_idx = DATA_SCHEMA.index('event_type')
curr_bot_idx = DATA_SCHEMA.index('event_user_is_bot_by_string')
hist_bot_idx = DATA_SCHEMA.index('event_user_is_bot_by_historical_string')
ip_editor_idx = DATA_SCHEMA.index('event_user_is_anonymous')
page_ns_idx = DATA_SCHEMA.index('page_namespace')
page_id_idx = DATA_SCHEMA.index('page_id')
redirect_idx = DATA_SCHEMA.index('page_is_redirect')

wiki_dir = os.path.join(DUMP_DIR, DATE, WIKI)
history_fns = sorted([fn for fn in os.listdir(wiki_dir) if fn.endswith('.tsv.bz2')])
pages = {}
start_time = time.time()
for fn in history_fns:
    print(f'{fn} ({(time.time() - start_time) / 60:.1f} minutes elapsed)')
    with bz2.open(os.path.join(wiki_dir, fn), 'rt') as fin:        
        for line in fin:
            line = line.rstrip('\n').split('\t')  # strip only the newline so empty trailing fields are preserved
            # determine user type
            user_type = 'user'
            if line[ip_editor_idx] == 'true':
                user_type = 'ip_editor'
            elif line[curr_bot_idx] or line[hist_bot_idx]:
                user_type = 'bot'
                
            try:
                ns = int(line[page_ns_idx])
                is_redirect = line[redirect_idx] == 'true'
                if ns == 0 and not is_redirect:
                    article_type = 'article'
                elif is_redirect:
                    article_type = 'redirect'
                elif ns % 2 == 1:
                    article_type = 'talk'
                else:
                    article_type = 'non-article'
                page_id = int(line[page_id_idx])
                if page_id == 0:
                    continue
            except ValueError:
                continue  # happens occasionally, but rarely enough that these rows can generally be skipped

            # identify bot-created pages
            if line[event_entity_idx] == 'page' and line[event_type_idx] == 'create':  
                if user_type == 'bot':
                    if article_type not in pages:
                        pages[article_type] = {}
                    # set to False -- not yet edited by a non-bot
                    pages[article_type][page_id] = False
            # edits
            elif line[event_entity_idx] == 'revision' and line[event_type_idx] == 'create':
                # human edit to bot-created page
                if user_type != 'bot' and page_id in pages.get(article_type, {}):
                    pages[article_type][page_id] = True
print(f"Processing {WIKI} took {(time.time() - start_time) / 60:.1f} minutes.")
2022-01.hrwiki.all-time.tsv.bz2 (0.0 minutes elapsed)
Processing hrwiki took 2.8 minutes.
In [16]:
# What percentage of bot-created pages go on to be edited by users/IP-editors?
ap = df.groupby('article_type')['user_type'].value_counts(dropna=False, normalize=True)
for article_type in pages:
    print(f'{article_type}: {len(pages[article_type])} pages created by bots ({100 * ap[article_type]["bot"]:.1f}% of total); {100 * sum(pages[article_type].values()) / len(pages[article_type]):.1f}% of them edited by non-bots')
non-article: 2151 pages created by bots (1.8% of total); 38.5% of them edited by non-bots
talk: 139223 pages created by bots (68.7% of total); 9.5% of them edited by non-bots
article: 17090 pages created by bots (6.6% of total); 42.4% of them edited by non-bots
redirect: 3126 pages created by bots (4.2% of total); 1.8% of them edited by non-bots
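If further processing is needed (sorting, plotting, or joining with the counts above), the pages dictionary can also be flattened into a small summary table; a minimal sketch:

# one row per article type: how many pages bots created and what share were later edited by non-bots
summary = pd.DataFrame(
    [{'article_type': article_type,
      'bot_created_pages': len(page_ids),
      'pct_edited_by_non_bots': 100 * sum(page_ids.values()) / len(page_ids)}
     for article_type, page_ids in pages.items()]
).set_index('article_type')
summary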