Example of working with denormalized Wikipedia dumps¶
The MediaWiki history dumps contain a wealth of information about all the actions taken on wikis. They can be quite large as a result, but they allow for very detailed analyses of edit behavior, page creation, and other on-wiki activity without needing to join in lots of other datasets.
Below is an example of how to access this dataset on PAWS. While the data for most wikis is likely too large to process via PAWS, this is a good way to test out analyses and to analyze smaller wikis without having to download the data locally.
In [1]:
import bz2
import csv
import os
import time
import pandas as pd
In [2]:
DUMP_DIR = "/public/dumps/public/other/mediawiki_history"
DATE = "2022-01"
WIKI = "hrwiki"
# for schema and how to interpret fields, see:
# https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Edits/Mediawiki_history_dumps#Technical_Documentation
DATA_SCHEMA = ["wiki_db",
"event_entity", "event_type", "event_timestamp", "event_comment_escaped",
"event_user_id", "event_user_text_historical_escaped", "event_user_text_escaped",
"event_user_blocks_historical_string", "event_user_blocks_string",
"event_user_groups_historical_string", "event_user_groups_string",
"event_user_is_bot_by_historical_string", "event_user_is_bot_by_string",
"event_user_is_created_by_self", "event_user_is_created_by_system", "event_user_is_created_by_peer",
"event_user_is_anonymous",
"event_user_registration_timestamp", "event_user_creation_timestamp",
"event_user_first_edit_timestamp", "event_user_revision_count", "event_user_seconds_since_previous_revision",
"page_id", "page_title_historical_escaped", "page_title_escaped",
"page_namespace_historical", "page_namespace_is_content_historical", "page_namespace", "page_namespace_is_content",
"page_is_redirect", "page_is_deleted",
"page_creation_timestamp", "page_first_edit_timestamp", "page_revision_count", "page_seconds_since_previous_revision",
"user_id", "user_text_historical_escaped", "user_text_escaped",
"user_blocks_historical_string", "user_blocks_string", "user_groups_historical_string", "user_groups_string",
"user_is_bot_by_historical_string", "user_is_bot_by_string",
"user_is_created_by_self", "user_is_created_by_system", "user_is_created_by_peer", "user_is_anonymous",
"user_registration_timestamp", "user_creation_timestamp", "user_first_edit_timestamp",
"revision_id", "revision_parent_id", "revision_minor_edit", "revision_deleted_parts_string",
"revision_deleted_parts_are_suppressed", "revision_text_bytes", "revision_text_bytes_diff",
"revision_text_sha1", "revision_content_model", "revision_content_format",
"revision_is_deleted_by_page_deletion", "revision_deleted_by_page_deletion_timestamp",
"revision_is_identity_reverted", "revision_first_identity_reverting_revision_id",
"revision_seconds_to_identity_revert", "revision_is_identity_revert", "revision_is_from_before_page_creation",
"revision_tags_string"]
In [3]:
# almost 500MB of data compressed, so parsing might take a while
!ls -shH {DUMP_DIR}/{DATE}/{WIKI}
total 484M
484M 2022-01.hrwiki.all-time.tsv.bz2
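Before parsing everything, it can help to sanity-check one row against the schema. A minimal sketch (not part of the original notebook), assuming the variables defined above; note that larger wikis are split into multiple time-partitioned files, so listing the directory (as the next cell does) is more robust than hard-coding a filename:

# peek at the first data row of the single dump file for this wiki
# and confirm the field count matches the schema
fn = f'{DATE}.{WIKI}.all-time.tsv.bz2'  # filename pattern observed above
with bz2.open(os.path.join(DUMP_DIR, DATE, WIKI, fn), 'rt') as fin:
    first_row = next(fin).rstrip('\n').split('\t')
print(len(first_row), len(DATA_SCHEMA))  # should be equal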
Simple analysis of who creates what pages¶
In [6]:
event_entity_idx = DATA_SCHEMA.index('event_entity')
event_type_idx = DATA_SCHEMA.index('event_type')
curr_bot_idx = DATA_SCHEMA.index('event_user_is_bot_by_string')
hist_bot_idx = DATA_SCHEMA.index('event_user_is_bot_by_historical_string')
ip_editor_idx = DATA_SCHEMA.index('event_user_is_anonymous')
page_ns_idx = DATA_SCHEMA.index('page_namespace')
page_id_idx = DATA_SCHEMA.index('page_id')
redirect_idx = DATA_SCHEMA.index('page_is_redirect')

wiki_dir = os.path.join(DUMP_DIR, DATE, WIKI)
history_fns = sorted([fn for fn in os.listdir(wiki_dir) if fn.endswith('.tsv.bz2')])

data = []
for fn in history_fns:
    print(fn)
    with bz2.open(os.path.join(wiki_dir, fn), 'rt') as fin:
        for line in fin:
            line = line.strip().split('\t')
            # Filter to just page creations
            # See: https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Edits/Mediawiki_history_dumps#Important_fields:_event_entity_and_event_type
            if line[event_entity_idx] == 'page' and line[event_type_idx] == 'create':
                # determine user type -- the is_bot_by fields are non-empty
                # strings when the user is flagged as a bot
                user_type = 'user'
                if line[ip_editor_idx] == 'true':
                    user_type = 'ip_editor'
                elif line[curr_bot_idx] or line[hist_bot_idx]:
                    user_type = 'bot'
                # verify valid page info
                try:
                    ns = int(line[page_ns_idx])
                    is_redirect = line[redirect_idx] == 'true'
                    if ns == 0 and not is_redirect:
                        article_type = 'article'
                    elif is_redirect:
                        article_type = 'redirect'
                    elif ns % 2 == 1:  # odd namespaces are talk pages
                        article_type = 'talk'
                    else:
                        article_type = 'non-article'
                    page_id = int(line[page_id_idx])
                    if page_id == 0:
                        continue
                except ValueError:
                    continue  # happens occasionally but rarely enough that it can generally be skipped
                data.append((article_type, user_type))
2022-01.hrwiki.all-time.tsv.bz2
In [7]:
df = pd.DataFrame(data, columns=['article_type', 'user_type'])
In [8]:
df.groupby('article_type')['user_type'].value_counts(dropna=False)
Out[8]:
article_type  user_type
article       user         213288
              ip_editor     27538
              bot           17090
non-article   user         117456
              bot            2151
              ip_editor      1986
redirect      user          70169
              bot            3126
              ip_editor      1720
talk          bot          139223
              user          57847
              ip_editor      5612
Name: user_type, dtype: int64
In [9]:
df.groupby('article_type')['user_type'].value_counts(dropna=False, normalize=True)
Out[9]:
article_type  user_type
article       user         0.826967
              ip_editor    0.106771
              bot          0.066262
non-article   user         0.965977
              bot          0.017690
              ip_editor    0.016333
redirect      user         0.935400
              bot          0.041672
              ip_editor    0.022929
talk          bot          0.686904
              user         0.285408
              ip_editor    0.027689
Name: user_type, dtype: float64
In [10]:
df.groupby('user_type')['article_type'].value_counts()
Out[10]:
user_type  article_type
bot        talk            139223
           article          17090
           redirect          3126
           non-article       2151
ip_editor  article          27538
           talk              5612
           non-article       1986
           redirect          1720
user       article         213288
           non-article     117456
           redirect         70169
           talk             57847
Name: article_type, dtype: int64
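The same counts can also be viewed as a single two-way table, which is easier to scan than the stacked Series above; a minimal sketch using pandas' crosstab:

# rows: article_type, columns: user_type, values: page-creation counts
pd.crosstab(df['article_type'], df['user_type'])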
Do bot-generated pages get edited?¶
In [11]:
event_entity_idx = DATA_SCHEMA.index('event_entity')
event_type_idx = DATA_SCHEMA.index('event_type')
curr_bot_idx = DATA_SCHEMA.index('event_user_is_bot_by_string')
hist_bot_idx = DATA_SCHEMA.index('event_user_is_bot_by_historical_string')
ip_editor_idx = DATA_SCHEMA.index('event_user_is_anonymous')
page_ns_idx = DATA_SCHEMA.index('page_namespace')
page_id_idx = DATA_SCHEMA.index('page_id')
redirect_idx = DATA_SCHEMA.index('page_is_redirect')

wiki_dir = os.path.join(DUMP_DIR, DATE, WIKI)
history_fns = sorted([fn for fn in os.listdir(wiki_dir) if fn.endswith('.tsv.bz2')])

pages = {}
start_time = time.time()
for fn in history_fns:
    print(f'{fn} ({(time.time() - start_time) / 60:.1f} minutes elapsed)')
    with bz2.open(os.path.join(wiki_dir, fn), 'rt') as fin:
        for line in fin:
            line = line.strip().split('\t')
            # determine user type
            user_type = 'user'
            if line[ip_editor_idx] == 'true':
                user_type = 'ip_editor'
            elif line[curr_bot_idx] or line[hist_bot_idx]:
                user_type = 'bot'
            # verify valid page info
            try:
                ns = int(line[page_ns_idx])
                is_redirect = line[redirect_idx] == 'true'
                if ns == 0 and not is_redirect:
                    article_type = 'article'
                elif is_redirect:
                    article_type = 'redirect'
                elif ns % 2 == 1:
                    article_type = 'talk'
                else:
                    article_type = 'non-article'
                page_id = int(line[page_id_idx])
                if page_id == 0:
                    continue
            except ValueError:
                continue  # happens occasionally but rarely enough that it can generally be skipped
            # identify bot-created pages
            if line[event_entity_idx] == 'page' and line[event_type_idx] == 'create':
                if user_type == 'bot':
                    if article_type not in pages:
                        pages[article_type] = {}
                    # set to False -- not yet edited by non-bots
                    pages[article_type][page_id] = False
            # edits
            elif line[event_entity_idx] == 'revision' and line[event_type_idx] == 'create':
                # human edit to a bot-created page
                if user_type != 'bot' and page_id in pages.get(article_type, {}):
                    pages[article_type][page_id] = True

print(f"Processing {WIKI} took {(time.time() - start_time) / 60:.1f} minutes.")
2022-01.hrwiki.all-time.tsv.bz2 (0.0 minutes elapsed)
Processing hrwiki took 2.8 minutes.
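The result is a nested dict mapping each article_type to a {page_id: edited_by_non_bot} dict. A quick spot-check of that structure (not part of the original notebook):

# print one tracked page_id per article_type along with its edited flag
for article_type, page_ids in pages.items():
    sample_id, edited = next(iter(page_ids.items()))
    print(article_type, sample_id, edited)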
In [16]:
# What percentage of bot-created pages go on to be edited by users/IP editors?
ap = df.groupby('article_type')['user_type'].value_counts(dropna=False, normalize=True)
for article_type in pages:
    print(f'{article_type}: {len(pages[article_type])} pages created by bots '
          f'({100 * ap[article_type]["bot"]:.1f}% of total); '
          f'{100 * sum(pages[article_type].values()) / len(pages[article_type]):.1f}% of them edited by non-bots')
non-article: 2151 pages created by bots (1.8% of total); 38.5% of them edited by non-bots
talk: 139223 pages created by bots (68.7% of total); 9.5% of them edited by non-bots
article: 17090 pages created by bots (6.6% of total); 42.4% of them edited by non-bots
redirect: 3126 pages created by bots (4.2% of total); 1.8% of them edited by non-bots
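For further analysis, the nested pages dict can be flattened into a DataFrame; a minimal sketch, with the bot_pages name and column labels being illustrative choices rather than part of the original notebook:

# one row per bot-created page; 'edited_by_non_bot' is the boolean flag set above
bot_pages = pd.DataFrame(
    [(article_type, page_id, edited)
     for article_type, page_ids in pages.items()
     for page_id, edited in page_ids.items()],
    columns=['article_type', 'page_id', 'edited_by_non_bot'])
# proportion of bot-created pages later edited by users/IP editors, per type
bot_pages.groupby('article_type')['edited_by_non_bot'].mean()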