From 175050e8e9a84fa1e8408653337aff1feaea4019 Mon Sep 17 00:00:00 2001 From: hwankook <36428250+hwankook@users.noreply.github.com> Date: Thu, 30 Apr 2020 02:39:56 -0600 Subject: [PATCH] temporal upload --- ...Team5-TopicBasedSearchEngine-CORD-19.ipynb | 6912 +++++++---------- 1 file changed, 2994 insertions(+), 3918 deletions(-) diff --git a/CS535-Team5-TopicBasedSearchEngine-CORD-19.ipynb b/CS535-Team5-TopicBasedSearchEngine-CORD-19.ipynb index 677849b..0f43235 100644 --- a/CS535-Team5-TopicBasedSearchEngine-CORD-19.ipynb +++ b/CS535-Team5-TopicBasedSearchEngine-CORD-19.ipynb @@ -7,7 +7,7 @@ }, "source": [ "

Table of Contents

\n", - "
" + "
" ] }, { @@ -28,14 +28,14 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 83, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Time for imports only: 0:00:01.366413\n" + "Time for imports only: 0:00:00.000997\n" ] } ], @@ -46,50 +46,51 @@ "import os\n", "import json\n", "import glob\n", - "import matplotlib.pyplot as plt\n", - "from hdfs import InsecureClient\n", "end_time = datetime.now()\n", "print(f'Time for imports only: {end_time-start_time}')" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 84, "metadata": {}, "outputs": [], "source": [ - "root_path = \"/s/chopin/b/grad/sanketm/cord19dataset/\"" + "root_path = \"D:/CORD-19-research-challenge/\"\n", + "metadata_path = f'{root_path}/metadata.csv'\n", + "biorxiv_medrxiv_path = \"D:/CORD-19-research-challenge/biorxiv_medrxiv/biorxiv_medrxiv\"" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 85, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['noncomm_use_subset',\n", - " 'cord19_specter_embeddings_2020-04-10',\n", - " 'custom_license',\n", + "['.ipynb_checkpoints',\n", " 'biorxiv_medrxiv',\n", + " 'comm_use_subset',\n", + " 'cord_19_embeddings_4_17',\n", " 'COVID.DATA.LIC.AGMT.pdf',\n", - " 'metadata.readme',\n", + " 'CS535-Team5-TopicBasedSearchEngine-CORD-19(2).ipynb',\n", + " 'CS535-Team5-TopicBasedSearchEngine-CORD-19.ipynb',\n", + " 'custom_license',\n", " 'json_schema.txt',\n", - " 'comm_use_subset',\n", - " 'biorxiv_medrxiv.tar.gz',\n", " 'metadata.csv',\n", - " 'images']" + " 'metadata.readme',\n", + " 'noncomm_use_subset']" ] }, - "execution_count": 9, + "execution_count": 85, "metadata": {}, "output_type": "execute_result" } ], "source": [ "os.chdir(root_path)\n", - "os.listdir()\n" + "os.listdir()" ] }, { @@ -110,7 +111,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "![title](/s/chopin/b/grad/sanketm/cord19dataset/images/article_format.png)" + 
"![title](images/article_format.png)" ] }, { @@ -129,7 +130,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 86, "metadata": {}, "outputs": [ { @@ -182,7 +183,7 @@ " Airborne rhinovirus detection and effect of ul...\n", " 10.1186/1471-2458-3-5\n", " PMC140314\n", - " 12525263.0\n", + " 12525263\n", " no-cc\n", " BACKGROUND: Rhinovirus, the most common cause ...\n", " 2003-01-13\n", @@ -203,7 +204,7 @@ " Discovering human history from stomach bacteria\n", " 10.1186/gb-2003-4-5-213\n", " PMC156578\n", - " 12734001.0\n", + " 12734001\n", " no-cc\n", " Recent analyses of human pathogens have reveal...\n", " 2003-04-28\n", @@ -224,7 +225,7 @@ " A new recruit for the army of the men of death\n", " 10.1186/gb-2003-4-7-113\n", " PMC193621\n", - " 12844350.0\n", + " 12844350\n", " no-cc\n", " The army of the men of death, in John Bunyan's...\n", " 2003-06-27\n", @@ -245,7 +246,7 @@ " Association of HLA class I with severe acute r...\n", " 10.1186/1471-2350-4-9\n", " PMC212558\n", - " 12969506.0\n", + " 12969506\n", " no-cc\n", " BACKGROUND: The human leukocyte antigen (HLA) ...\n", " 2003-09-12\n", @@ -266,7 +267,7 @@ " A double epidemic model for the SARS propagation\n", " 10.1186/1471-2334-3-19\n", " PMC222908\n", - " 12964944.0\n", + " 12964944\n", " no-cc\n", " BACKGROUND: An epidemic of a Severe Acute Resp...\n", " 2003-09-10\n", @@ -298,12 +299,12 @@ "3 Association of HLA class I with severe acute r... 
10.1186/1471-2350-4-9 \n", "4 A double epidemic model for the SARS propagation 10.1186/1471-2334-3-19 \n", "\n", - " pmcid pubmed_id license \\\n", - "0 PMC140314 12525263.0 no-cc \n", - "1 PMC156578 12734001.0 no-cc \n", - "2 PMC193621 12844350.0 no-cc \n", - "3 PMC212558 12969506.0 no-cc \n", - "4 PMC222908 12964944.0 no-cc \n", + " pmcid pubmed_id license \\\n", + "0 PMC140314 12525263 no-cc \n", + "1 PMC156578 12734001 no-cc \n", + "2 PMC193621 12844350 no-cc \n", + "3 PMC212558 12969506 no-cc \n", + "4 PMC222908 12964944 no-cc \n", "\n", " abstract publish_time \\\n", "0 BACKGROUND: Rhinovirus, the most common cause ... 2003-01-13 \n", @@ -319,12 +320,12 @@ "3 Lin, Marie; Tseng, Hsiang-Kuang; Trejaut, Jean... BMC Med Genet \n", "4 Ng, Tuen Wai; Turinici, Gabriel; Danchin, Antoine BMC Infect Dis \n", "\n", - " Microsoft Academic Paper ID WHO #Covidence has_pdf_parse \\\n", - "0 NaN NaN True \n", - "1 NaN NaN True \n", - "2 NaN NaN False \n", - "3 NaN NaN True \n", - "4 NaN NaN True \n", + " Microsoft Academic Paper ID WHO #Covidence has_pdf_parse \\\n", + "0 NaN NaN True \n", + "1 NaN NaN True \n", + "2 NaN NaN False \n", + "3 NaN NaN True \n", + "4 NaN NaN True \n", "\n", " has_pmc_xml_parse full_text_file \\\n", "0 True custom_license \n", @@ -341,46 +342,116 @@ "4 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2... " ] }, - "execution_count": 10, + "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "meta = pd.read_csv(\"./metadata.csv\")\n", + "meta = pd.read_csv(metadata_path, dtype={\n", + " 'pubmed_id': str,\n", + " 'Microsoft Academic Paper ID': str, \n", + " 'doi': str\n", + "})\n", "meta.head()" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cord_uidshasource_xtitledoipmcidpubmed_idlicenseabstractpublish_timeauthorsjournalMicrosoft Academic Paper IDWHO #Covidencehas_pdf_parsehas_pmc_xml_parsefull_text_fileurl
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [cord_uid, sha, source_x, title, doi, pmcid, pubmed_id, license, abstract, publish_time, authors, journal, Microsoft Academic Paper ID, WHO #Covidence, has_pdf_parse, has_pmc_xml_parse, full_text_file, url]\n", + "Index: []" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "meta[meta['abstract'] == '']" + ] + }, + { + "cell_type": "code", + "execution_count": 88, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "cord_uid 51078\n", - "sha 38022\n", - "source_x 51078\n", - "title 50920\n", - "doi 47741\n", - "pmcid 41082\n", - "pubmed_id 37861\n", - "license 51078\n", - "abstract 42352\n", - "publish_time 51070\n", - "authors 48891\n", - "journal 46368\n", + "cord_uid 52398\n", + "sha 39024\n", + "source_x 52398\n", + "title 52240\n", + "doi 49058\n", + "pmcid 43652\n", + "pubmed_id 38058\n", + "license 52398\n", + "abstract 43168\n", + "publish_time 52390\n", + "authors 50119\n", + "journal 47156\n", "Microsoft Academic Paper ID 964\n", "WHO #Covidence 1768\n", - "has_pdf_parse 51078\n", - "has_pmc_xml_parse 51078\n", - "full_text_file 42511\n", - "url 50776\n", + "has_pdf_parse 52398\n", + "has_pmc_xml_parse 52398\n", + "full_text_file 43794\n", + "url 52096\n", "dtype: int64" ] }, - "execution_count": 11, + "execution_count": 88, "metadata": {}, "output_type": "execute_result" } @@ -391,7 +462,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 89, "metadata": {}, "outputs": [ { @@ -399,28 +470,28 @@ "output_type": "stream", "text": [ "\n", - "RangeIndex: 51078 entries, 0 to 51077\n", + "RangeIndex: 52398 entries, 0 to 52397\n", "Data columns (total 18 columns):\n", - "cord_uid 51078 non-null object\n", - "sha 38022 non-null object\n", - "source_x 51078 non-null object\n", - "title 50920 non-null object\n", - "doi 47741 non-null object\n", - "pmcid 41082 non-null object\n", - "pubmed_id 37861 non-null 
float64\n", - "license 51078 non-null object\n", - "abstract 42352 non-null object\n", - "publish_time 51070 non-null object\n", - "authors 48891 non-null object\n", - "journal 46368 non-null object\n", - "Microsoft Academic Paper ID 964 non-null float64\n", + "cord_uid 52398 non-null object\n", + "sha 39024 non-null object\n", + "source_x 52398 non-null object\n", + "title 52240 non-null object\n", + "doi 49058 non-null object\n", + "pmcid 43652 non-null object\n", + "pubmed_id 38058 non-null object\n", + "license 52398 non-null object\n", + "abstract 43168 non-null object\n", + "publish_time 52390 non-null object\n", + "authors 50119 non-null object\n", + "journal 47156 non-null object\n", + "Microsoft Academic Paper ID 964 non-null object\n", "WHO #Covidence 1768 non-null object\n", - "has_pdf_parse 51078 non-null bool\n", - "has_pmc_xml_parse 51078 non-null bool\n", - "full_text_file 42511 non-null object\n", - "url 50776 non-null object\n", - "dtypes: bool(2), float64(2), object(14)\n", - "memory usage: 6.3+ MB\n" + "has_pdf_parse 52398 non-null bool\n", + "has_pmc_xml_parse 52398 non-null bool\n", + "full_text_file 43794 non-null object\n", + "url 52096 non-null object\n", + "dtypes: bool(2), object(16)\n", + "memory usage: 6.5+ MB\n" ] } ], @@ -439,7 +510,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 90, "metadata": {}, "outputs": [ { @@ -448,7 +519,7 @@ "'1e1286db212100993d03cc22374b624f7caee956'" ] }, - "execution_count": 13, + "execution_count": 90, "metadata": {}, "output_type": "execute_result" } @@ -457,28 +528,6 @@ "meta[\"sha\"][0]" ] }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'daf32e013d325a6feb80e83d15aabc64a48fae33'" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bio= meta.loc[meta['source_x'] == 'biorxiv']\n", - "bio.head()\n", - "bio.iloc[1]['sha']" - ] - }, { 
"cell_type": "markdown", "metadata": {}, @@ -488,7 +537,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 91, "metadata": {}, "outputs": [ { @@ -498,7 +547,7 @@ " dtype=object)" ] }, - "execution_count": 15, + "execution_count": 91, "metadata": {}, "output_type": "execute_result" } @@ -523,7 +572,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 92, "metadata": {}, "outputs": [ { @@ -537,7 +586,7 @@ " 'medrxiv': None}" ] }, - "execution_count": 16, + "execution_count": 92, "metadata": {}, "output_type": "execute_result" } @@ -549,16 +598,16 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 93, "metadata": {}, "outputs": [], "source": [ - "file_path['biorxiv'] = \"/s/chopin/b/grad/sanketm/cord19dataset/biorxiv_medrxiv/biorxiv_medrxiv/pdf_json/\"\n" + "file_path['biorxiv'] = \"D:/CORD-19-research-challenge/\"" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 94, "metadata": {}, "outputs": [], "source": [ @@ -574,2115 +623,384 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 95, "metadata": {}, "outputs": [], "source": [ "article_array = {}\n", "for i in range(5):\n", - " article_array[i] = json.load(open(bio.iloc[i]['sha'] + \".json\"))\n" + " #article_array[i] = json.load(open(meta[\"sha\"][i] + \".json\"))\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [], + "source": [ + "for i in range(5):\n", + " #print(article_array[i])\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [], + "source": [ + "#article_array[0].keys()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From above, we can see that there are keys that describe each paper of the biorxiv journal. Of these, Our paritcular interest lies in the abstract and the body_text. 
The title is useful too, and to get it, we have to look inside the metadata part of the file. " + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [], + "source": [ + "#article_array[0]['metadata'].keys()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If printed as a whole, each file would look like the below output:" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [], + "source": [ + "#f = open('04dc6d9c1e3f28e2b57934301593cf3da58b9935.json')\n", + "#for line in f:\n", + " #print(line)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## How to load and read a json file" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "#with open('04dc6d9c1e3f28e2b57934301593cf3da58b9935.json') as f:\n", + " #data = json.load(f)\n", + "\n", + "#print(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [], + "source": [ + "#data.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [], + "source": [ + "#for entry in data['abstract']:\n", + " #print(entry['section'],\"\\n\",entry['text'])" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [], + "source": [ + "def load_json(filename):\n", + " with open(filename + \".json\") as f:\n", + " data = json.load(f)\n", + " return data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Code to read all json files in the directory in one go (Cite: [Kernel on Kaggle](https://www.kaggle.com/maksimeren/covid-19-literature-clustering))" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 104, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'paper_id': 
'f056da9c64fbf00a4645ae326e8a4339d015d155', 'metadata': {'title': 'SIANN: Strain Identification by Alignment to Near Neighbors', 'authors': [{'first': 'Samuel', 'middle': ['S'], 'last': 'Minot', 'suffix': '', 'affiliation': {}, 'email': ''}, {'first': 'Stephen', 'middle': ['D'], 'last': 'Turner', 'suffix': '', 'affiliation': {}, 'email': ''}, {'first': 'Krista', 'middle': ['L'], 'last': 'Ternus', 'suffix': '', 'affiliation': {}, 'email': ''}, {'first': 'Dana', 'middle': ['R'], 'last': 'Kadavy', 'suffix': '', 'affiliation': {}, 'email': ''}]}, 'abstract': [{'text': 'Next-generation sequencing is increasingly being used to study samples composed of mixtures of organisms, such as in clinical applications where the presence of a pathogen at very low abundance may be highly important. We present an analytical method (SIANN: Strain Identification by Alignment to Near Neighbors) specifically designed to rapidly detect a set of target organisms in mixed samples that achieves a high degree of species-and strain-specificity by aligning short sequence reads to the genomes of near neighbor organisms, as well as that of the target. Empirical benchmarking alongside the current state-of-the-art methods shows an extremely high Positive Predictive Value, even at very low abundances of the target organism in a mixed sample. SIANN is available as an Illumina BaseSpace app, as well as through Signature Science, LLC. SIANN results are presented in a streamlined report designed to be comprehensible to the non-specialist user, providing a powerful tool for rapid species detection in a mixed sample. By focusing on a set of (customizable) target organisms and their near neighbors, SIANN can operate quickly and with low computational requirements while delivering highly accurate results.', 'cite_spans': [], 'ref_spans': [], 'section': 'Abstract'}], 'body_text': [{'text': 'There are many different methods that characterize the mixture of organisms present within a metagenomic dataset. 
Such datasets are generated when a complex environmental sample is processed by a \"next-generation\" high-throughput genome sequencing protocol, and they consist of large numbers of short nucleotide sequences. Each sequence represents a small fragment of a randomly selected genome from the very large collection of genomes present in the source sample. Those sequences indicate the presence of one organism or another according to their similarity to a set of known reference genomes. While a given sequence may be unique to one species, it also may be found in diverse organisms across the tree of life. Therefore, one analytical challenge (among many) is to take that collection of sequences (likely numbering in the millions) and accurately determine what species are present in the sample. Here we describe a novel method (SIANN: Strain Identification by Alignment to Near Neighbors) that is specifically designed to rapidly detect a set of targeted organisms from a metagenomic dataset by aligning reads to genomic regions that are unique at the strain or species level.', 'cite_spans': [], 'ref_spans': [], 'section': 'Introduction'}, {'text': \"The analytical question motivating a particular piece of metagenomic bioinformatic analysis may vary widely by user and sample type (Segata, et al., 2013) . For example, the function of the human gut microbiome may depend on the relative abundance of hundreds of species of bacteria and the types of metabolic genes they contain (Wu, et al., 2011; Schloissnig, et al., 2013) . In contrast, the clinical treatment of a patient may depend on whether or not a particular virus, or a consortium of co-infecting pathogens, is/are detected in their blood. It is this second class of presence/absence questions that SIANN is designed to address. 
SIANN is appropriate for situations in which a user wants to know whether a particular organism or set of organisms is present in a sample, but isn't interested in the functions encoded in their genomes, the relative abundance of each organism, or any other more in-depth analysis.\", 'cite_spans': [{'start': 132, 'end': 154, 'text': '(Segata, et al., 2013)', 'ref_id': None}, {'start': 329, 'end': 347, 'text': '(Wu, et al., 2011;', 'ref_id': None}, {'start': 348, 'end': 374, 'text': 'Schloissnig, et al., 2013)', 'ref_id': 'BIBREF10'}], 'ref_spans': [], 'section': 'Introduction'}, {'text': 'Metagenomic classification methods are based on a wide variety of theoretical underpinnings. The basic varieties include alignment of reads to various nucleotide databases or exact matching to nucleotide or protein signature sequences (or kmers). A representative set of recent methods are described in Table 1 (also see Bazinet & Cummings 2012) .', 'cite_spans': [{'start': 321, 'end': 345, 'text': 'Bazinet & Cummings 2012)', 'ref_id': 'BIBREF1'}], 'ref_spans': [{'start': 303, 'end': 310, 'text': 'Table 1', 'ref_id': None}], 'section': 'Approach'}, {'text': 'Alignment to large nucleotide database Huson, et al., 2011 PhymmBL Alignment to large nucleotide database with interpolated Markov models Brady & Salzberg, 2011 Metaphyler Alignment to clade-specific marker genes Liu, et al., Overall, these methods are designed to either classify individual reads to, and/or predict the total abundance of, clades (e.g. genus or species) across the entire tree of life. They generally require reference databases that are very large and/or require a large amount of processing to generate. The gap SIANN is designed to fill is when the entire tree of life is irrelevant, and only predefined subsets of organisms need to be detected. 
For an underlying method we chose read alignment to diagnostic genomic regions because the algorithms for read alignment are highly parallelizable and have been optimized heavily by the community at large (the current implementation of SIANN uses bowtie2 [Langmead & Salzberg, 2012] for the alignment function, but can be adapted to any alignment algorithm). This approach is distinct from using cladespecific marker genes (Segata, et al., 2012) because unique regions that are larger, smaller, or outside of genes can also be used. Furthermore, this approach supports the rapid construction of custom databases using reference genome sets that require only minimal user-supplied structure.', 'cite_spans': [{'start': 39, 'end': 66, 'text': 'Huson, et al., 2011 PhymmBL', 'ref_id': None}, {'start': 138, 'end': 160, 'text': 'Brady & Salzberg, 2011', 'ref_id': None}, {'start': 213, 'end': 225, 'text': 'Liu, et al.,', 'ref_id': None}, {'start': 922, 'end': 949, 'text': '[Langmead & Salzberg, 2012]', 'ref_id': 'BIBREF7'}, {'start': 1090, 'end': 1112, 'text': '(Segata, et al., 2012)', 'ref_id': None}], 'ref_spans': [], 'section': 'Name Method Reference MEGAN'}, {'text': 'To understand the principle at work, consider a set of reads that have been aligned to the genomes of several strains belonging to two species. Some regions of those genomes are species-specific, some are strain-specific, and some are shared ( Figure 1a ). When a set of reads is aligned to those genomes such that each read is placed in as many locations as it has a match (at a reasonably stringent threshold), visual inspection of the distribution of reads yields an intuitive understanding of the true source organism as Species I/Strain B (Figure 1b ). 
If Strain B were not present in the reference database, it would still be clear that the organism was an unknown strain of Species I.', 'cite_spans': [], 'ref_spans': [{'start': 244, 'end': 254, 'text': 'Figure 1a', 'ref_id': None}, {'start': 545, 'end': 555, 'text': '(Figure 1b', 'ref_id': None}], 'section': 'Name Method Reference MEGAN'}, {'text': 'The unique identification of a species or strain is quantified by the proportion of the genome that is determined to be species-or strain-specific (defined as reads that are aligned to regions that are species-or strain-specific). Each species and strain is then assigned a numerical measure of the proportion that is covered by these diagnostic reads, and that proportional measure is compared to the ideal case, where sequences from a single organism (generated in silico) are aligned against the database in an identical manner. After that normalization factor is applied, the resulting score indicates whether the source sample contained any of the organisms in the reference database. Figure 1 . A) For a group of strains belonging to two different species, some regions may be unique to each species (region 1), while other regions may be unique to strains within each species (regions 2 and 3). B) A set of reads are aligned to these genomes, and the ones that align in a species-or strain-specific manner are identified by the combination of genomes to which they align. In this example, Strain B of Species I is the organism identified.', 'cite_spans': [], 'ref_spans': [{'start': 690, 'end': 698, 'text': 'Figure 1', 'ref_id': None}], 'section': 'Name Method Reference MEGAN'}, {'text': 'The analysis is conducted independently on both the species and the strain level, so that if the true strain is not present in the database, the species of origin will still be identified. 
While many methods consider the complete taxonomic tree and assign reads to the least common ancestor, SIANN considers only two taxonomic levels: species and strain, throwing out anything that is not unique at one of those levels and thus obviating many of the confounding factors introduced by manually curated taxonomies.', 'cite_spans': [], 'ref_spans': [], 'section': 'Name Method Reference MEGAN'}, {'text': 'The example shown in Figure 1b indicates that species-specific reads are identified as reads that align to one species (Species I, in that case) but not the other. If Species II were not present in the example shown in Figure 1b , a much larger number of reads would be assigned as \"species-specific,\" when in fact those regions are shared with other species. Therefore, the ability of this method to identify strain-and species-specific sequences is a direct function of the inclusion of near neighbors in the reference database. This characteristic is shared among many classification algorithms, but it is of particular note for this method when users have an opportunity to construct their own database.. In order to detect a target species with a high degree of specificity (reducing false positives), it is necessary to include other related species in the reference database. Only by parallel alignment to those near neighbors can the redundant sequences be separated from the species-specific ones. For example, in order to detect Bacillus anthracis in a sample, it would be necessary to include other species of Bacilli in the reference database so that the presence of B. cereus or B. thuringiensis in a sample does not lead to a false call for B. 
anthracis.', 'cite_spans': [], 'ref_spans': [{'start': 21, 'end': 30, 'text': 'Figure 1b', 'ref_id': None}, {'start': 219, 'end': 228, 'text': 'Figure 1b', 'ref_id': None}], 'section': 'Name Method Reference MEGAN'}, {'text': 'The nomenclature of genus, species, and strain is potentially problematic because it does not correspond to a consistent degree of evolutionary distance or genomic distinctiveness. The ability to distinguish two organisms by any method using genomic sequence data is proportional to the amount of each genome that is shared or unique. One might assume that any two organisms of the same species will have a relatively predictable amount of shared genomic identity. However, some pairs of organisms from the same species may have less in common than other pairs of organisms from different species or even genera. This ambiguity impacts SIANN in two ways. If two organisms have very little genomic sequence to distinguish them, the sensitivity of SIANN to detect either one will diminish (the rate of false negatives will increase as the likelihood of sequencing unique regions decreases). Conversely, if an organism is extremely dissimilar to the near neighbors selected for the database, the specificity with which SIANN detects that organism will decline (the rate of false positives will increase as the number of related genomes available in the database decreases). For example, if a database contained only E. coli and B.anthracis, a sample containing B. cereus would be misidentified as contraining B. anthracis. In the intended use case, a database targeting B. anthracis would contain B. cereus and a number of other near neighbors to prevent that kind of misidentification. 
It would be convenient to say that an ideal database can be made by calculating the ideal genetic distance between all references and then finding an ideal set of organisms to make up that database, but the behavior of any database will be governed by the particular genomes of the organisms it encounters in the wild. Because not all organisms evolve in the same manner (differences in mutation rate, horizontal gene transfer, recombination, etc), the suitability of a database and method to detect a given organism can only be determined by thorough validation and benchmarking, as well as updating the reference database as needed. Users of SIANN may construct their own custom databases to include newly identified genomes or specific subsets of genomes that best suit their research interests.', 'cite_spans': [], 'ref_spans': [], 'section': 'Name Method Reference MEGAN'}, {'text': 'Steps to construct a custom database:', 'cite_spans': [], 'ref_spans': [], 'section': 'Name Method Reference MEGAN'}, {'text': '1. Select a set of target organisms 2. Gather a set of genome sequences for those target organisms as well as a matched set of near neighbors 3. Using those reference genome sequences as an input, SIANN will: a. Construct a reference index for alignment b. Simulate a set of reads from each genome c. Align each of those simulated read sets to all of the reference genomes d. Calculate the proportion of each reference genome that is strain-or species-specific e. [If two organisms do not have a minimal amount of unique sequences that exceeds the rate of sequencing error, SIANN asks that all but one of those organisms are removed from the database to eliminate redundancy. 
Note that the user can provide a single representative genome with multiple strain names so that the redundant strain names are not lost.]', 'cite_spans': [], 'ref_spans': [], 'section': 'Name Method Reference MEGAN'}, {'text': 'The files contained within each SIANN database are a compressed genomic index and a list containing the proportion of each reference genome that was found to be strainor species-specific during database construction.', 'cite_spans': [], 'ref_spans': [], 'section': 'Name Method Reference MEGAN'}, {'text': 'To run SIANN:', 'cite_spans': [], 'ref_spans': [], 'section': 'Name Method Reference MEGAN'}, {'text': '1. Select a pre-made SIANN database and a set of sequences to be analyzed, and 2. SIANN will: a. Align each of the reads against the reference genomes b. Calculate the proportion of each reference genome that is strain-or species-specific within those reads c. Compare that proportion to the simulated ideal case generated during database creation . CC-BY-NC-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprint d. Calculate the probability that the given results could be generated by random chance e. Report the normalized proportion and non-parametric statistic of likelihood for each strain and species in the reference database. The normalized proportion of the genome covered by strain-or speciesspecific reads is the primary statistic reported by this tool.', 'cite_spans': [], 'ref_spans': [], 'section': 'Name Method Reference MEGAN'}, {'text': 'The performance of SIANN (version 1.6) was tested in comparison to the following stateof-the-art metagenomic classification programs: LMAT (version 1.2), MetaPhlAn (version 1.7.7), and Kraken (version 0.9.1b). 
All of the programs in Table 1 were investigated for this effort, and three were chosen based on their ability to run on our high-performance computing cluster with an execution time and memory requirement that would be suitable to a clinical lab. Each program was run on a set of 600 simulated datasets generated by MetaSim (Richter, et al., 2008) . Each dataset consisted of 15,000,000 reads (100bp single-ended) with Illumina-simulated error (fourth-degree polynomial) (Korbel, et al., 2009) . The 600 datasets were broken into 12 sets of 50 replicates. Each of the 12 sets contained organisms at different levels of abundance as shown in Table 2 .', 'cite_spans': [{'start': 535, 'end': 558, 'text': '(Richter, et al., 2008)', 'ref_id': 'BIBREF9'}, {'start': 682, 'end': 704, 'text': '(Korbel, et al., 2009)', 'ref_id': 'BIBREF5'}], 'ref_spans': [{'start': 233, 'end': 240, 'text': 'Table 1', 'ref_id': None}, {'start': 852, 'end': 859, 'text': 'Table 2', 'ref_id': None}], 'section': 'Benchmarking'}, {'text': 'Organisms were specifically chosen in pairs so that the ability to distinguish these near neighbors could be determined. The abundances were staggered at 4-fold intervals so that a wide range could be evaluated. All known species of near neighbors for each of the 12 target organisms were included in the reference database used by SIANN for this benchmarking (\"Target Pathogen Database\") and are shown in Appendix 1.', 'cite_spans': [], 'ref_spans': [], 'section': 'Benchmarking'}, {'text': 'Each program outputs a distinct measure. Kraken and LMAT both count the reads assigned to each taxon, MetaPhlAn calculates the abundance, and SIANN outputs a measure of the proportion of diagnostic genomic regions present. To put these measures on an even footing, we empirically calculated the false positive rate for each method over all 600 samples, at each possible measure of output. Because each dataset is made up of known organisms, any result can be classified as true or false. 
Therefore, for any possible result (say, 513 reads classified by LMAT or 1.6% abundance assigned by MetaPhlAn), one can calculate the proportion of calls with at least the same amount of support that were correct (True Positives/[True Positives+False Positives]), over all of the 600 datasets. That measure is commonly given as Positive Predictive Value (PPV). For each program, the results can be translated from the raw value into a PPV that is based on this empirical measure of error. The key item of interest is the PPV value for the results that we know to be true positives, the defined spike organisms. Another way of describing this approach is to say that the results of each program have been normalized to the false positive error rate that was empirically observed. If another set of samples were generated, the PPV vs. raw value curve ( Figure 2 ) would likely fall differently, but in this case it gives us a means of comparing a diverse set of methods against the same ground truth. If method 1 detects an organism with a higher PPV than method 2 does, it means that method 1 has fewer false positives in the range that it reports true positives, which is the definition of utility in this setting.', 'cite_spans': [], 'ref_spans': [{'start': 1339, 'end': 1347, 'text': 'Figure 2', 'ref_id': 'FIGREF0'}], 'section': 'Benchmarking'}, {'text': 'For each method, PPV was calculated as a function of raw output value. Briefly, this was done by compiling the output for all 600 samples, labeling each result as false or true based on the sample set that it came from, and then calculating (at each possible value of output) what the proportion of TP/[TP+FP] was for results with at least that level of raw output. Some simplification steps were taken, such as focusing on the specieslevel assignments (for comparison with methods that do not perform strain assignment), and only taking the top hit for each species from each dataset. 
Custom R and BASH scripts were used for the data compilation and analysis.', 'cite_spans': [], 'ref_spans': [], 'section': 'Benchmarking'}, {'text': 'The relationship of raw output value to PPV is shown for each of the four methods in Figure 2 . The point at which PPV is very close to 1 (where 95% of results are true positives) is ~41,000 reads for Kraken, ~2,800 reads for LMAT, ~38% abundance for MetaPhlAn, and 0.21 for SIANN. For SIANN this means that having 38% of the species-unique genome covered by reads resulted in the vast majority of calls being accurate. . CC-BY-NC-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprint For readassignment methods (such as LMAT and Kraken), manual inspection of the results may yield a different understanding of confidence than is presented here, or in any automated analysis.', 'cite_spans': [], 'ref_spans': [{'start': 85, 'end': 93, 'text': 'Figure 2', 'ref_id': 'FIGREF0'}], 'section': 'Results'}, {'text': 'For example, while each read that is assigned by LMAT and Kraken fall above a certain cutoff for speciesspecificity, some individual reads may be much more specific than others. One could identify a read that aligns to a single species of bacteria with 100% accuracy over its 300bp length, with the next closest match being only 90% similar. It is extremely unlikely that a 300bp exact match would arise due to random chance, and so the user could say with confidence that the organism of interest is found within the sequence data (not considering contamination, horizontal gene transfer, etc). However, such an approach is not currently implemented in an automated method, and many of the steps needed to make that assertion are performed manually by a domain expert, including alignment to near neighbors and ensuring that the read does not fall within a transposon, plasmid, etc. 
Therefore, while one could say that a single read is all that is needed to state with high PPV that an organism is present, the amount of reads assigned in an automated manner needed to achieve that level of PPV will number in the thousands (Fig 2) . Table 2 ), and each program (boxes at top), across a maximum of 50 replicates (indicated by the size of each point). Note that the reference database for MetaPhlAn does not include viruses, and the reference database for Kraken does not include RNA viruses (e.g., Hanta virus).', 'cite_spans': [], 'ref_spans': [{'start': 1125, 'end': 1132, 'text': '(Fig 2)', 'ref_id': 'FIGREF0'}, {'start': 1135, 'end': 1142, 'text': 'Table 2', 'ref_id': None}], 'section': 'Results'}, {'text': '. CC-BY-NC-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprint The next phase of benchmarking was to determine how many raw input reads were needed to achieve the threshold for high PPV. To demonstrate this we plotted the known abundance of each spike organism against the PPV value generated by each method (Figure 3 ). Each point (an organism at a known level of abundance) is comprised of a maximum of 50 replicates, where the diameter of each point increases with an increasing number of replicates. For demonstration purposes we are showing two pairs of bacteria and three viruses. Recall that for each of the pairs of bacteria (and the two poxviruses) any sample containing one did not contain the other (as shown in Table 1 ). The empty boxes result from the organisms not being called at any abundance. For MetaPhlAn, that is a result of no viruses being included in the version of the reference database available for this analysis. Kraken assigned no reads to Hanta virus because viral RNA genomes were not included in this version of the reference database (personal communication with D. Wood). 
This emphasizes the point that a) the ability to create custom databases targeting organisms of interest can be valuable, and b) the performance of any method must be benchmarked against each potential target of interest.', 'cite_spans': [], 'ref_spans': [{'start': 460, 'end': 469, 'text': '(Figure 3', 'ref_id': 'FIGREF1'}, {'start': 875, 'end': 882, 'text': 'Table 1', 'ref_id': None}], 'section': 'Results'}, {'text': 'All methods were able identify the bulk of organisms in their databases at high abundances (75% and 18%, Figure 3 ), however performance varied considerably at lower abundances and depended on the particular organism and method used. SIANN detected each organism at high confidence, even at levels as low as 0.3% and 0.07% of the total.', 'cite_spans': [], 'ref_spans': [{'start': 105, 'end': 113, 'text': 'Figure 3', 'ref_id': 'FIGREF1'}], 'section': 'Results'}, {'text': \"The process of detecting trace amounts of a specific organism in a complex mixture of DNA is challenging enough for an expert, but that pales in comparison to the difficulty of accomplishing the same certainty of detection in an automated manner. The results presented here show that SIANN rapidly detects the presence of a given set of organisms with a high degree of specificity and sensitivity. For example, at the 95% confidence (PPV) cutoff of 0.2, SIANN reliably detects all of the organisms tested here at as low as 0.3% abundance. This strong performance is likely due to the fact that SIANN is able to use a method (read alignment to whole genomes) that would be far too computationally costly if it were applied to the entire collection of known genomes. By focusing on a set of (customizable) target organisms and their near neighbors, SIANN can operate quickly and with low computational requirements while delivering highly accurate results. 
SIANN is available on Illumina's BaseSpace (www.basespace.illumina.com) as a NativeApp, with the database tested here (Appendix 1), as well as a database made from the NCBI representative set of prokaryotic genomes (ftp://ftp.ncbi.nlm.nih.gov/genomes/genome_reports/) (Appendix 2) and the complete set of NCBI viral genomes (ftp://ftp.ncbi.nlm.nih.gov/refseq/release/viral/) (Appendix 3).\", 'cite_spans': [], 'ref_spans': [], 'section': 'Discussion'}, {'text': \". CC-BY-NC-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprint BaseSpace was chosen as an appropriate release platform because while the entire set of software and dependencies can be deployed by the user from within a graphical user interface, the actual computation takes place in a controlled 'cloud' environment. Such a distribution strategy obviates the need to satisfy the multiple software or OS dependencies that often arises with academic computational methods. Results for SIANN are compiled into a report format, showing both the organisms that surpass 95% confidence, as well as the closest strain match for each species. The default view masks the raw data output, so that the results are human-readable and do not present extraneous information. While the code for execution and databaseconstruction on a users system is available from Signature Science, LLC, additional databases on the BaseSpace platform can be made available upon request.\", 'cite_spans': [], 'ref_spans': [], 'section': 'Discussion'}, {'text': 'There is a neverending list of questions that one could ask of metagenomic sequencing data generated from important samples. Instead of answering them all, we demonstrate a technique with a very narrow focus that is able to report with a high degree of confidence whether a given set of organisms is present in a sample. 
These results are presented to the user in a comprehensible format, and accessible on a commonly-used web platform. The world of bioinformatics will continue to progress and develop more sophisticated tools for metagenomic analysis, and we hope that the utility of SIANN will convince others to package and benchmark their tools in a way that they can be used with confidence by the larger public, as well as the research community.', 'cite_spans': [], 'ref_spans': [], 'section': 'Discussion'}, {'text': '. CC-BY-NC-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprint', 'cite_spans': [], 'ref_spans': [], 'section': 'Discussion'}], 'bib_entries': {'BIBREF0': {'ref_id': 'b0', 'title': 'Scalable metagenomic taxonomy classification using a reference genome database', 'authors': [{'first': 'S', 'middle': ['K'], 'last': 'Ames', 'suffix': ''}, {'first': 'D', 'middle': ['A'], 'last': 'Hysom', 'suffix': ''}, {'first': 'S', 'middle': ['N'], 'last': 'Gardner', 'suffix': ''}, {'first': 'G', 'middle': ['S'], 'last': 'Lloyd', 'suffix': ''}, {'first': 'M', 'middle': ['B'], 'last': 'Gokhale', 'suffix': ''}, {'first': 'Allen', 'middle': [], 'last': 'Je', 'suffix': ''}], 'year': 2013, 'venue': 'Bioinformatics', 'volume': '29', 'issn': '18', 'pages': '2253--60', 'other_ids': {'DOI': ['10.1093/bioinformatics/btt389']}}, 'BIBREF1': {'ref_id': 'b1', 'title': 'A comparative evaluation of sequence classification programs', 'authors': [{'first': 'A', 'middle': ['L'], 'last': 'Bazinet', 'suffix': ''}, {'first': 'M', 'middle': ['P'], 'last': 'Cummings', 'suffix': ''}], 'year': 2012, 'venue': 'BMC Bioinformatics', 'volume': '13', 'issn': '', 'pages': '', 'other_ids': {'DOI': ['10.1186/1471-2105-13-92']}}, 'BIBREF2': {'ref_id': 'b2', 'title': 'Rapid 
phylogenetic and functional classification of short genomic fragments with signature peptides', 'authors': [{'first': 'J', 'middle': [], 'last': 'Berendzen', 'suffix': ''}, {'first': 'W', 'middle': ['J'], 'last': 'Bruno', 'suffix': ''}, {'first': 'J', 'middle': ['D'], 'last': 'Cohn', 'suffix': ''}, {'first': 'N', 'middle': ['W'], 'last': 'Hengartner', 'suffix': ''}, {'first': 'C', 'middle': ['R'], 'last': 'Kuske', 'suffix': ''}, {'first': 'B', 'middle': ['H'], 'last': 'Mcmahon', 'suffix': ''}, {'first': 'M', 'middle': ['A'], 'last': 'Wolinsky', 'suffix': ''}, {'first': 'G', 'middle': [], 'last': 'Xie', 'suffix': ''}], 'year': 2012, 'venue': 'BMC Research Notes', 'volume': '5', 'issn': '', 'pages': '', 'other_ids': {'DOI': ['10.1186/1756-0500-5-460']}}, 'BIBREF3': {'ref_id': 'b3', 'title': 'Phymm and PhymmBL: metagenomic phylogenetic classification with interpolated Markov models', 'authors': [{'first': 'A', 'middle': [], 'last': 'Brady', 'suffix': ''}, {'first': 'S', 'middle': ['L'], 'last': 'Salzberg', 'suffix': ''}], 'year': 2009, 'venue': 'Nature Methods', 'volume': '6', 'issn': '9', 'pages': '673--679', 'other_ids': {'DOI': ['10.1038/nmeth.1358']}}, 'BIBREF4': {'ref_id': 'b4', 'title': 'Integrative analysis of environmental sequences using MEGAN4', 'authors': [{'first': 'D', 'middle': ['H'], 'last': 'Huson', 'suffix': ''}, {'first': 'S', 'middle': [], 'last': 'Mitra', 'suffix': ''}, {'first': 'N', 'middle': [], 'last': 'Weber', 'suffix': ''}, {'first': 'H', 'middle': [], 'last': 'Ruscheweyh', 'suffix': ''}, {'first': 'S', 'middle': ['C'], 'last': 'Schuster', 'suffix': ''}], 'year': 2011, 'venue': 'Genome Research', 'volume': '21', 'issn': '', 'pages': '1552--1560', 'other_ids': {}}, 'BIBREF5': {'ref_id': 'b5', 'title': 'PEMer: a computational framework with simulation-based error models for inferring genomic structural variants from massive paired-end sequencing data', 'authors': [{'first': 'J', 'middle': ['O'], 'last': 'Korbel', 'suffix': ''}, {'first': 'A', 
'middle': [], 'last': 'Abyzov', 'suffix': ''}, {'first': 'X', 'middle': ['J'], 'last': 'Mu', 'suffix': ''}, {'first': 'N', 'middle': [], 'last': 'Carriero', 'suffix': ''}, {'first': 'P', 'middle': [], 'last': 'Cayting', 'suffix': ''}, {'first': 'Z', 'middle': [], 'last': 'Zhang', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Snyder', 'suffix': ''}, {'first': 'M', 'middle': ['B'], 'last': 'Gerstein', 'suffix': ''}], 'year': 2009, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF7': {'ref_id': 'b7', 'title': 'Fast gapped-read alignment with Bowtie 2', 'authors': [{'first': 'B', 'middle': [], 'last': 'Langmead', 'suffix': ''}, {'first': 'S', 'middle': [], 'last': 'Salzberg', 'suffix': ''}], 'year': 2012, 'venue': 'Nature Methods', 'volume': '9', 'issn': '', 'pages': '357--359', 'other_ids': {}}, 'BIBREF8': {'ref_id': 'b8', 'title': 'Accurate and fast estimation of taxonomic profiles from metagenomic shotgun sequences', 'authors': [{'first': 'B', 'middle': [], 'last': 'Liu', 'suffix': ''}, {'first': 'T', 'middle': [], 'last': 'Gibbons', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Ghodsi', 'suffix': ''}, {'first': 'T', 'middle': [], 'last': 'Treangen', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Pop', 'suffix': ''}], 'year': 2011, 'venue': 'BMC Genomics', 'volume': '12', 'issn': '2', 'pages': '', 'other_ids': {}}, 'BIBREF9': {'ref_id': 'b9', 'title': 'MetaSim: a sequencing simulator for genomics and metagenomics', 'authors': [{'first': 'D', 'middle': ['C'], 'last': 'Richter', 'suffix': ''}, {'first': 'F', 'middle': [], 'last': 'Ott', 'suffix': ''}, {'first': 'A', 'middle': ['F'], 'last': 'Auch', 'suffix': ''}, {'first': 'R', 'middle': [], 'last': 'Schmid', 'suffix': ''}, {'first': 'D', 'middle': ['H'], 'last': 'Huson', 'suffix': ''}], 'year': 2008, 'venue': 'PLoS One', 'volume': '3', 'issn': '10', 'pages': '', 'other_ids': {'DOI': ['10.1371/journal.pone.0003373']}}, 'BIBREF10': {'ref_id': 'b10', 'title': 'Genomic 
variation landscape of the human gut microbiome', 'authors': [{'first': 'S', 'middle': [], 'last': 'Schloissnig', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Arumugam', 'suffix': ''}, {'first': 'S', 'middle': [], 'last': 'Sunagawa', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Mitreva', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Tap', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Zhu', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Waller', 'suffix': ''}, {'first': 'D', 'middle': ['R'], 'last': 'Mende', 'suffix': ''}, {'first': 'J', 'middle': ['R'], 'last': 'Kultima', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Martin', 'suffix': ''}, {'first': 'K', 'middle': [], 'last': 'Kota', 'suffix': ''}, {'first': 'S', 'middle': ['R'], 'last': 'Sunyaev', 'suffix': ''}, {'first': 'G', 'middle': ['M'], 'last': 'Weinstock', 'suffix': ''}, {'first': 'P', 'middle': [], 'last': 'Bork', 'suffix': ''}], 'year': 2013, 'venue': 'Nature', 'volume': '493', 'issn': '7430', 'pages': '45--50', 'other_ids': {'DOI': ['10.1038/nature11711']}}}, 'ref_entries': {'FIGREF0': {'text': 'Relationship of reported value for each program (horizontal axis, log scale) to the empirically-determined Positive Predictive Value (PPV), shown on the vertical axis. While the exact values depend on the test data used, the general values at significant cutoff values (0.8, 0.9, 0.95 PPV) remain relatively constant across different datasets (data not shown).', 'latex': None, 'type': 'figure'}, 'FIGREF1': {'text': 'The Positive Predictive Value (PPV, vertical axis) is shown for each organism (boxes on right), at each level of known abundance (horizontal axis, see', 'latex': None, 'type': 'figure'}, 'TABREF0': {'text': '. Summary of methods for metagenomic classification.', 'latex': None, 'type': 'table'}, 'TABREF1': {'text': 'Table 2. The abundance of each target organism in each set of simulated datasets. 
Each set is indicated by the number in the top row, and was generated with 50 replicates.', 'latex': None, 'type': 'table'}, 'TABREF2': {'text': 'Segata N, Waldron L, Ballarini A, Narasimhan V,Jousson O, and Huttenhower C (2012). Metagenomic microbial community profiling using unique clade-specific marker genes. NatureMethods, 9(8):811-4. doi: 10.1038/nmeth.2066. Sunagawa S, Mende DR, Zeller G, Izquierdo-Carrasco F, Berger SA, Kultima JR, Coelho LP, Arumugam M, Tap J, Nielsen HB, Rasmussen S, Brunak S, Pedersen O, Guarner F, de Vos WM, Wang J, Li J, Doré J, Ehrlich SD, Stamatakis A & Bork P (2013). Metagenomic species profiling using universal phylogenetic marker genes. Nature Methods, 10, 1196-1199 (2013) doi:10.1038/nmeth.2693 Wood DE and Salzberg SL. Ultrafast metagenomic sequence classification using exact alignments. In submission. Wu GD, Chen J, Hoffmann C, Bittinger K, Chen YY, Keilbaugh SA, Bewtra M, Knights D, Walters WA, Knight R, Sinha R, Gilroy E, Gupta K, Baldassano R, Nessel L, Li H, Bushman FD, Lewis JD (2011). Linking long-term dietary patterns with gut microbial enterotypes. Science. 334(6052):105-8. doi: 10.1126/science.1208344. The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprint Appendix 2: Viral Database Abaca bunchy top virus DNA-C Abaca bunchy top virus DNA-M Abaca bunchy top virus DNA-The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . 
https://doi.org/10.1101Circovirus-like genome BBC-A Circovirus-like genome CB-A Circovirus-like genome CB-B Circovirus-like genome RW-A Circovirus-like genome RW-B Circovirus-like genome RW-C Circovirus-like genome RW-D Circovirus-like genome RW-E Circovirus-like genome SAR-A Circovirus-like genome SAR-B Citrus psorosis virus RNA1 Citrus psorosis virus RNA2 Citrus psorosis virus RNA3The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprintThe copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprintThe copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprint . CC-BY-NC-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprint . CC-BY-NC-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder.It . https://doi.org/10.1101/001727 doi: bioRxiv preprint Thermovirga lienii DSM 17291 . CC-BY-NC-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprint Thermus scotoductus SA 01 Thermus sp CCB US3 UF1 Thermus thermophilus HB27 Vibrio cholerae O1 biovar El Tor str N16961 Vibrio fischeri ES114 Vibrio furnissii NCTC 11218 Vibrio harveyi ATCC BAA 1116 . 
CC-BY-NC-ND 4.0 International license is made available under a', 'latex': None, 'type': 'table'}}, 'back_matter': []}\n", - "{'paper_id': 'daf32e013d325a6feb80e83d15aabc64a48fae33', 'metadata': {'title': 'Spatial epidemiology of networked metapopulation: An overview', 'authors': [{'first': 'Lin', 'middle': [], 'last': 'Wang', 'suffix': '', 'affiliation': {'laboratory': 'Adaptive Networks and Control Laboratory', 'institution': 'Fudan University', 'location': {'postCode': '200433', 'settlement': 'Shanghai', 'country': 'China'}}, 'email': ''}, {'first': '·', 'middle': ['Xiang'], 'last': 'Li', 'suffix': '', 'affiliation': {'laboratory': 'Adaptive Networks and Control Laboratory', 'institution': 'Fudan University', 'location': {'postCode': '200433', 'settlement': 'Shanghai', 'country': 'China'}}, 'email': ''}, {'first': 'L', 'middle': [], 'last': 'Wang', 'suffix': '', 'affiliation': {'laboratory': '', 'institution': 'The University of Hong Kong', 'location': {'settlement': 'Hong Kong SAR', 'country': 'China'}}, 'email': ''}, {'first': '·', 'middle': ['X'], 'last': 'Li', 'suffix': '', 'affiliation': {'laboratory': 'Centre for Chaos and Complex Networks', 'institution': 'City University of Hong Kong', 'location': {'addrLine': 'Li Ka Shing', 'settlement': 'Hong Kong SAR', 'country': 'China'}}, 'email': ''}]}, 'abstract': [{'text': 'An emerging disease is one infectious epidemic caused by a newly transmissible pathogen, which has either appeared for the first time or already existed in human populations, having the capacity to increase rapidly in incidence as well as geographic range. Adapting to human immune system, emerging diseases may trigger large-scale pandemic spreading, such as the transnational spreading of SARS, the global outbreak of A(H1N1), and the recent potential invasion of avian influenza A(H7N9). 
To study the dynamics mediating the transmission of emerging diseases, spatial epidemiology of networked metapopulation provides a valuable modeling framework, which takes spatially distributed factors into consideration. This review elaborates the latest progresses on the spatial metapopulation dynamics, discusses empirical and theoretical findings that verify the validity of networked metapopulations, and the application in evaluating the effectiveness of disease intervention strategies as well.', 'cite_spans': [], 'ref_spans': [], 'section': 'Abstract'}], 'body_text': [{'text': 'The term metapopulation was coined by Levins [1] in 1969 to describe a population dynamics model of insect pests in farmlands, yet the perspective has been broadly applied to study the effect of spatially distributed factors on evolutionary dynamics [2] , including genetic drift, pattern formation, extinction and recolonization, etc. The development of metapopulation theory, in conjunction with the fast development of complex networks theory, lead to the innovative application of the networked metapopulation in modeling largescale spatial transmission of emerging diseases. This interdisciplinary research field has attracted much attention by the scientific communities from diverse disciplines, such as public health, mathematical biology, statistical physics, information science, sociology, and complexity science. New insights are contributed to understanding the spatial dynamics of epidemic spreading, which provides valuable support to public healthcare.', 'cite_spans': [{'start': 45, 'end': 48, 'text': '[1]', 'ref_id': 'BIBREF0'}, {'start': 250, 'end': 253, 'text': '[2]', 'ref_id': 'BIBREF1'}], 'ref_spans': [], 'section': 'Introduction'}, {'text': 'This review presents a survey of recent advances in the emergent discipline of networked metapopulation epidemiology, which is organized as follows. 
Section 2 introduces some preliminaries of the compartment model, network epidemiology, and networked metapopulation, and also elucidates their relevance. Section 3 specifies the validity of networked metapopulation. Section 4 focuses on the recent progresses on metapopulation dynamics. The application in evaluating the performance of intervention strategies is presented in Section 5, and some outlooks are provided at last. models have been proposed [3, 4] . The compartment model is one of the simplest yet basic epidemic models, which was first introduced by Bernoulli [5] in the 18th century.', 'cite_spans': [{'start': 603, 'end': 606, 'text': '[3,', 'ref_id': 'BIBREF2'}, {'start': 607, 'end': 609, 'text': '4]', 'ref_id': 'BIBREF3'}, {'start': 724, 'end': 727, 'text': '[5]', 'ref_id': 'BIBREF4'}], 'ref_spans': [], 'section': 'Introduction'}, {'text': 'Assuming that a population of individuals is mixed homogeneously, this model organizes the persons into different compartments (states), according to their health status, e.g., susceptible (denoted by S, those healthy ones who may acquire the infection), infectious (I, those infected ones who are contagious), and recovered (R, those who are recovered from the disease). Within each compartment, all individuals are identical. The transitions between different compartments depend on the specific transition rates. For example, the transmission rate β represents the infection probability for a susceptible individual that encounters an infectious person, and the recovery rate µ represents the probability with which an infectious individual is recovered. 
If the disease could not endow recovered persons with a long lasting immunity but infect them again, e.g., seasonal flu, asthma, gonorrhoea, the related epidemic reactions are well described by the so called SIS model; otherwise, if recovered people become immune permanently to the disease, e.g., pandemic influenza, pertussis, smallpox, the epidemic dynamics can be characterized by the SIR model properly. Figures 1(a) -(b) illustrate the relevant compartment transitions in the SIS and SIR models, respectively. The dynamical evolution of these models can be simply delineated by ordinary differential equations [3] .', 'cite_spans': [{'start': 1374, 'end': 1377, 'text': '[3]', 'ref_id': 'BIBREF2'}], 'ref_spans': [{'start': 1167, 'end': 1179, 'text': 'Figures 1(a)', 'ref_id': 'FIGREF0'}], 'section': 'Introduction'}, {'text': 'One key parameter characterizing the severity of a disease is the basic reproductive number, R 0 , which identifies the expected number of infected individuals generated by introducing an infectious carrier into an entire susceptible population. This parameter signifies the epidemic threshold applied for predicting whether or not an infectious disease will prevail. Typically, given a \"well-mixed\" population, R 0 = β/µ. If R 0 < 1, the disease dies out quickly, which implies that the population remains at the disease-free state.', 'cite_spans': [], 'ref_spans': [], 'section': 'Introduction'}, {'text': 'Due to the ubiquity of complex systems in modern society, the study of complex networks becomes prosperous [6] [7] [8] [9] . The Internet and human friendship networks are just a few examples that can be regarded as systems comprised of a large number of connected dynamical units. The most intuitive approach of modeling such complex systems is to treat them as networks, where nodes represent component units and edges represent connectivity. 
Importantly, empirical findings have unraveled the presence of universal features in most socio-technical networks, e.g., small-world [10] , scale-free (SF) [11] , which inspires extensive studies towards a better understanding about the impact of population infrastructures (network connectivity) on dynamical processes [12] [13] [14] [15] , including robustness [16, 17] , synchronization [18] [19] [20] , consensus [21] [22] [23] [24] , control [25] [26] [27] [28] , evolutionary game [29] [30] [31] [32] [33] [34] [35] [36] , traffic routing [37] [38] [39] , selforganized criticality [40] [41] [42] [43] , etc.', 'cite_spans': [{'start': 107, 'end': 110, 'text': '[6]', 'ref_id': 'BIBREF5'}, {'start': 111, 'end': 114, 'text': '[7]', 'ref_id': 'BIBREF6'}, {'start': 115, 'end': 118, 'text': '[8]', 'ref_id': 'BIBREF7'}, {'start': 119, 'end': 122, 'text': '[9]', 'ref_id': 'BIBREF8'}, {'start': 579, 'end': 583, 'text': '[10]', 'ref_id': 'BIBREF9'}, {'start': 602, 'end': 606, 'text': '[11]', 'ref_id': 'BIBREF10'}, {'start': 766, 'end': 770, 'text': '[12]', 'ref_id': 'BIBREF11'}, {'start': 771, 'end': 775, 'text': '[13]', 'ref_id': 'BIBREF12'}, {'start': 776, 'end': 780, 'text': '[14]', 'ref_id': 'BIBREF13'}, {'start': 781, 'end': 785, 'text': '[15]', 'ref_id': 'BIBREF14'}, {'start': 809, 'end': 813, 'text': '[16,', 'ref_id': 'BIBREF15'}, {'start': 814, 'end': 817, 'text': '17]', 'ref_id': 'BIBREF16'}, {'start': 836, 'end': 840, 'text': '[18]', 'ref_id': 'BIBREF17'}, {'start': 841, 'end': 845, 'text': '[19]', 'ref_id': 'BIBREF18'}, {'start': 846, 'end': 850, 'text': '[20]', 'ref_id': 'BIBREF19'}, {'start': 863, 'end': 867, 'text': '[21]', 'ref_id': 'BIBREF20'}, {'start': 868, 'end': 872, 'text': '[22]', 'ref_id': 'BIBREF21'}, {'start': 873, 'end': 877, 'text': '[23]', 'ref_id': 'BIBREF22'}, {'start': 878, 'end': 882, 'text': '[24]', 'ref_id': 'BIBREF23'}, {'start': 893, 'end': 897, 'text': '[25]', 'ref_id': 'BIBREF24'}, {'start': 898, 'end': 902, 'text': '[26]', 'ref_id': 
'BIBREF25'}, {'start': 903, 'end': 907, 'text': '[27]', 'ref_id': 'BIBREF26'}, {'start': 908, 'end': 912, 'text': '[28]', 'ref_id': 'BIBREF27'}, {'start': 933, 'end': 937, 'text': '[29]', 'ref_id': 'BIBREF28'}, {'start': 938, 'end': 942, 'text': '[30]', 'ref_id': 'BIBREF29'}, {'start': 943, 'end': 947, 'text': '[31]', 'ref_id': 'BIBREF30'}, {'start': 948, 'end': 952, 'text': '[32]', 'ref_id': 'BIBREF31'}, {'start': 953, 'end': 957, 'text': '[33]', 'ref_id': 'BIBREF32'}, {'start': 958, 'end': 962, 'text': '[34]', 'ref_id': 'BIBREF33'}, {'start': 963, 'end': 967, 'text': '[35]', 'ref_id': 'BIBREF34'}, {'start': 968, 'end': 972, 'text': '[36]', 'ref_id': 'BIBREF35'}, {'start': 991, 'end': 995, 'text': '[37]', 'ref_id': 'BIBREF36'}, {'start': 996, 'end': 1000, 'text': '[38]', 'ref_id': 'BIBREF37'}, {'start': 1001, 'end': 1005, 'text': '[39]', 'ref_id': 'BIBREF38'}, {'start': 1034, 'end': 1038, 'text': '[40]', 'ref_id': 'BIBREF39'}, {'start': 1039, 'end': 1043, 'text': '[41]', 'ref_id': 'BIBREF40'}, {'start': 1044, 'end': 1048, 'text': '[42]', 'ref_id': 'BIBREF41'}, {'start': 1049, 'end': 1053, 'text': '[43]', 'ref_id': 'BIBREF42'}], 'ref_spans': [], 'section': 'Network epidemiology.'}, {'text': 'Assuming that interactive individuals are mixed homogeneously, the aforementioned epidemic compartment model neglects the significance of population connectivity. Such simplification can hardly solve new puzzles emerged in the present networking society. For example, why is it extremely difficult to eradicate computer viruses from the Internet or the World Wide Web, and why do those viruses have an unusual long lifetime [44] ? Similar matters have been observed in diverse systems, ranging from the web of human sexual relations to vaccination campaigns [4] . One key factor inducing such problems is the scale-free property of the networked systems, which causes a serious trouble that the threshold of disease outbreak vanishes [45] . 
Within complex networks, the basic reproductive number is R sf 0 = R 0 [1 + (CV ) 2 ], with CV identifying the coefficient of variation of the degree distribution (degree represents the number of edges k per node) [46] . For large networks taking on a scalefree heterogeneous topology, R 0 is always larger than 1 no matter how small the transmission rate may be, due to the infinite variance of the degree distribution.', 'cite_spans': [{'start': 424, 'end': 428, 'text': '[44]', 'ref_id': 'BIBREF43'}, {'start': 558, 'end': 561, 'text': '[4]', 'ref_id': 'BIBREF3'}, {'start': 734, 'end': 738, 'text': '[45]', 'ref_id': 'BIBREF44'}, {'start': 955, 'end': 959, 'text': '[46]', 'ref_id': 'BIBREF45'}], 'ref_spans': [], 'section': 'Network epidemiology.'}, {'text': 'This meaningful finding has motivated the research of network epidemiology, which concerns particularly the spreading of epidemics in human social networks [4, 14, 47] . Many subsequent works investigated extensively the epidemic threshold on networks with special topological features, such as degree correlations [48] , small world [49] , community [50] , edge length [51] , and K-core [52] . [53, 54] demonstrated that the vanishing epidemic threshold of the SIS model derives from the active behavior of the largest hub, which acts as a self-sustained source of the infection. Such disastrous effect of highly connected hubs can also be observed in reality, such as the presence of core groups in the propagation of sexually transmitted diseases, and the appearance of the patient zero that induces the dissemination of human immunodeficiency virus (HIV). 
Considering that the threshold condition generally predicts the final state of the epidemic evolution, Li and Wang [49] studied the relaxation behavior of epidemic spreading before reaching a final disease-free or endemic phase.', 'cite_spans': [{'start': 156, 'end': 159, 'text': '[4,', 'ref_id': 'BIBREF3'}, {'start': 160, 'end': 163, 'text': '14,', 'ref_id': 'BIBREF13'}, {'start': 164, 'end': 167, 'text': '47]', 'ref_id': 'BIBREF46'}, {'start': 315, 'end': 319, 'text': '[48]', 'ref_id': 'BIBREF47'}, {'start': 334, 'end': 338, 'text': '[49]', 'ref_id': 'BIBREF48'}, {'start': 351, 'end': 355, 'text': '[50]', 'ref_id': 'BIBREF49'}, {'start': 370, 'end': 374, 'text': '[51]', 'ref_id': 'BIBREF50'}, {'start': 388, 'end': 392, 'text': '[52]', 'ref_id': 'BIBREF51'}, {'start': 395, 'end': 399, 'text': '[53,', 'ref_id': 'BIBREF52'}, {'start': 400, 'end': 403, 'text': '54]', 'ref_id': 'BIBREF53'}, {'start': 975, 'end': 979, 'text': '[49]', 'ref_id': 'BIBREF48'}], 'ref_spans': [], 'section': 'Network epidemiology.'}, {'text': \"Although the performance of public healthcare systems has been improved prominently to weaken the threat of emerging diseases, it is impossible to entail a world free of infectious pathogens [55] . From the beginning of this new century, we have already witnessed several cases of the large-scale geographic transmission of pandemics. In 2003, through the international airline network, the SARS coronavirus (SARS-CoV) was rapidly transmitted from Hong Kong to more than 30 countries [56, 57] . Several years later, in 2009, the A(H1N1) swept across the world through public transportation networks again: With only 3 to 4 months, it had spread over about 200 countries [58] [59] [60] [61] . Recent potential invasion of avian influenza A(H7N9) poses a new challenge [62] [63] [64] [65] . It seems that the widespread risk of emerging diseases is higher than before. 
This urgent circumstance stems from the changes of human social ecology in population distribution as well as human mobility patterns [66, 67] . Crowded metropolises resulting from the urbanization process induce people's frequent contacts, and the fast development of massive transportation (e.g., civil aviation) generates a nonlocal pattern of human mobility, sharply reducing the time of travel as well as the distance between populous cities.\", 'cite_spans': [{'start': 191, 'end': 195, 'text': '[55]', 'ref_id': 'BIBREF54'}, {'start': 484, 'end': 488, 'text': '[56,', 'ref_id': 'BIBREF55'}, {'start': 489, 'end': 492, 'text': '57]', 'ref_id': 'BIBREF56'}, {'start': 670, 'end': 674, 'text': '[58]', 'ref_id': 'BIBREF57'}, {'start': 675, 'end': 679, 'text': '[59]', 'ref_id': 'BIBREF58'}, {'start': 680, 'end': 684, 'text': '[60]', 'ref_id': 'BIBREF59'}, {'start': 685, 'end': 689, 'text': '[61]', 'ref_id': 'BIBREF60'}, {'start': 767, 'end': 771, 'text': '[62]', 'ref_id': 'BIBREF61'}, {'start': 772, 'end': 776, 'text': '[63]', 'ref_id': 'BIBREF62'}, {'start': 777, 'end': 781, 'text': '[64]', 'ref_id': 'BIBREF63'}, {'start': 782, 'end': 786, 'text': '[65]', 'ref_id': 'BIBREF64'}, {'start': 1001, 'end': 1005, 'text': '[66,', 'ref_id': 'BIBREF65'}, {'start': 1006, 'end': 1009, 'text': '67]', 'ref_id': 'BIBREF66'}], 'ref_spans': [], 'section': 'Networked metapopulation.'}, {'text': 'It is not convincing to describe the large-scale spatial pandemic spreading by directly following the routine of network epidemiology, since the network perspective still concerns the epidemic outbreak in a single population, despite considering the connectivity structure among hosts. 
This can hardly capture the key features of spatial transmission of infectious diseases: epidemics prevails inside separate locations such as cities, each of which can be regarded as a pop-ulation, and is transmitted among populations through the travel of infected individuals.', 'cite_spans': [], 'ref_spans': [], 'section': 'Networked metapopulation.'}, {'text': 'Spatial distribution of populations and human mobility among connected locations are the pivotal elements mediating the transmission of pandemic diseases. To introduce spatially distributed factors into modeling substrates, it is intuitive to generalize the network model by defining each node as a subpopulation that has a specific location, in which a population of individuals interplays according to the compartment rule. People are also permitted to transfer among subpopulations through mobility networks. This individualnetwork frame organizes the entire system into networked populations, leading to an important class of model in modern epidemiology, namely, the networked metapopulation. Figure 2 illustrates the basic modeling structure.', 'cite_spans': [], 'ref_spans': [{'start': 698, 'end': 706, 'text': 'Figure 2', 'ref_id': 'FIGREF1'}], 'section': 'Networked metapopulation.'}, {'text': 'Aside from the above conceptual descriptions, it is also essential to verify the validity of the model from theoretical as well as empirical perspectives.', 'cite_spans': [], 'ref_spans': [], 'section': 'Validity of networked metapopulation'}, {'text': 'Developing a probabilistic metapopulation with the consideration of long-range human migrations via a worldwide aviation network composed of 500 largest airports, Hufnagel et al. [68] first demonstrated the feasibility of forecasting the real-world transmission of SARS through computational approaches. To study the spatiotemporal patterns of the transmission process, Colizza et al. 
[69] defined a statistical measure based on the information entropy, which quantifies the disorder level encoded in the evolution profiles of disease prevalence. Comparing the pandemic spreading on a datadriven networked metapopulation with that on random reshuffled models providing null hypotheses, the authors unveiled the presence of a high-level heterogeneity in the geographic transmission of epidemics. author/funder. All rights reserved. No reuse allowed without permission.', 'cite_spans': [{'start': 179, 'end': 183, 'text': '[68]', 'ref_id': 'BIBREF67'}, {'start': 385, 'end': 389, 'text': '[69]', 'ref_id': 'BIBREF68'}], 'ref_spans': [], 'section': 'Validity of networked metapopulation'}, {'text': 'The copyright holder for this preprint (which was not peer-reviewed) is the . https://doi.org/10.1101/003889 doi: bioRxiv preprint', 'cite_spans': [], 'ref_spans': [], 'section': 'Validity of networked metapopulation'}, {'text': 'To assess the predictability of metapopulation models, one typical approach focuses on the coincident extent between the simulation results and the realistic surveillance reports for each contaminated region, which is an arduous task due to the sophisticated calibration of parameters as well as the unavoidable noise presented in the surveillance process. Concerning the logistical feasibility of the model, one can resort to an alternative simple means of inspecting the evolution of related scaling laws [70] , which is relevant to critical transition patterns. 
The scaling theory concerns the functional relations describing the data collapsing onto a power-law curve, and the relations of the critical-point exponents [71] .', 'cite_spans': [{'start': 507, 'end': 511, 'text': '[70]', 'ref_id': 'BIBREF69'}, {'start': 723, 'end': 727, 'text': '[71]', 'ref_id': 'BIBREF70'}], 'ref_spans': [], 'section': 'Validity of networked metapopulation'}, {'text': \"The Zipf's law and the Heaps' law are two representative scaling laws that usually emerge together in various complex systems, however, their joint emergence has hardly been clarified [72] . Using the data of laboratory confirmed cases of SARS, H5N1, and A(H1N1) to analyze the joint emergence of these two scalings in the evolution process of large-scale geographic transmissions, Wang et al. [70] unraveled a universal feature that the Zipf's law and the Heaps' law are naturally shaped to coexist at the initial stage of an outbreak, while a crossover comes with their incoherence later before reaching a stable state, where the Heaps' law still presents with the wane of the strict Zipf's law. With the census populations and domestic air transportation data of the United States (US) [73, 74] , a data-driven metapopulation network model on the US country level is developed to analyze the evolution patterns of scaling emergence. In contrast with a random reshuffled model with a homogeneous structure, the data-driven heterogeneous metapopulation successfully reproduced the scaling transitions observed in the real-world pandemics. 
This demonstrates that the high-level heterogeneity of infrastructure plays a key role in characterizing the spatial transmission of infectious diseases, which also provides a new insight to clarifying the interdependence between the Zipf's and Heaps' scaling laws.\", 'cite_spans': [{'start': 184, 'end': 188, 'text': '[72]', 'ref_id': 'BIBREF71'}, {'start': 394, 'end': 398, 'text': '[70]', 'ref_id': 'BIBREF69'}, {'start': 789, 'end': 793, 'text': '[73,', 'ref_id': 'BIBREF72'}, {'start': 794, 'end': 797, 'text': '74]', 'ref_id': None}], 'ref_spans': [], 'section': 'Validity of networked metapopulation'}, {'text': 'Within each subpopulation, the individuals are mixed homogeneously, according to the coarse-grained approximation of the metapopulation framework. Interestingly, this assumption can be supported by recent empirical studies on the intra-urban human mobility. The analysis of the data generated by the mobile phone or GPS shows that human movement in the urban scale (e.g., inside a city) generally has an exponential or binomial trip-length distribution [75] [76] [77] [78] [79] . Although this does not simply mean that short-range human mobility is random, the related dynamical feature is similar with that of the Boltzmann gas, if the relevance among individuals is so weak as to be negligible [76, 80, 81] . 
Accordingly, the homogeneous mixing (within each subpopulation) assumption is adopted to ease the computation.', 'cite_spans': [{'start': 453, 'end': 457, 'text': '[75]', 'ref_id': 'BIBREF73'}, {'start': 458, 'end': 462, 'text': '[76]', 'ref_id': 'BIBREF74'}, {'start': 463, 'end': 467, 'text': '[77]', 'ref_id': 'BIBREF75'}, {'start': 468, 'end': 472, 'text': '[78]', 'ref_id': 'BIBREF76'}, {'start': 473, 'end': 477, 'text': '[79]', 'ref_id': 'BIBREF77'}, {'start': 697, 'end': 701, 'text': '[76,', 'ref_id': 'BIBREF74'}, {'start': 702, 'end': 705, 'text': '80,', 'ref_id': 'BIBREF78'}, {'start': 706, 'end': 709, 'text': '81]', 'ref_id': 'BIBREF79'}], 'ref_spans': [], 'section': 'Validity of networked metapopulation'}, {'text': 'More promisingly, full-scales computational models become increasingly popular, due to the continuous increase of computer power as well as the fast technical developments of data collection and processing [82] [83] [84] . In some cases, the real-time forecast of pandemic spreading is becoming reality [85] . Technical details for the estimation and validation of a large number of parameters in these models are beyond the interest of this review. Next section focuses on the recent theoretical progress of metapopulation dynamics.', 'cite_spans': [{'start': 206, 'end': 210, 'text': '[82]', 'ref_id': None}, {'start': 211, 'end': 215, 'text': '[83]', 'ref_id': 'BIBREF81'}, {'start': 216, 'end': 220, 'text': '[84]', 'ref_id': 'BIBREF82'}, {'start': 303, 'end': 307, 'text': '[85]', 'ref_id': 'BIBREF83'}], 'ref_spans': [], 'section': 'Validity of networked metapopulation'}, {'text': 'As stated in Section 2, the networked metapopulation model is constructed with the individual-network frame, where the individuals are organized into social units (e.g., villages, towns, cities) defined as subpopulations, which are connected by transportation networks that identify the mobility routes. 
The disease prevails inside each subpopulation due to interpersonal contacts, and is transmitted among subpopulations through the mobility of infected individuals. Typically, the model is comprised of two scales of dynamics: (i) disease invasion among different subpopulations; (ii) disease reaction within each subpopulation. Recent progresses on these two aspects are specified here.', 'cite_spans': [], 'ref_spans': [], 'section': 'Two scales of dynamics: Recent progress'}, {'text': 'The substrate of metapopulation depends on the spatial structure of social environment, such as transport infrastructures and mobility patterns. The lack of fine-grained data capturing structural features of human mobility systems leads to the traditional application of random graphs or regular lattices, which assumes homogeneous infrastructures for the mobility substrates. To generalize metapopulation models with network approaches, the first attempt was contributed by Rvachev and Longini [86] , in which 52 major cities worldwide in that epoch were connected through an intercity aviation transportation network. They applied this mathematical model to simulate the global spread of the 1968-1969 Hong Kong (H3N2) flu.', 'cite_spans': [{'start': 495, 'end': 499, 'text': '[86]', 'ref_id': 'BIBREF84'}], 'ref_spans': [], 'section': 'Inter-subpopulation invasion.'}, {'text': 'Subsequently, comparing the effect of non-local human anomalous diffusion with that of the ordinary diffusion behavior, Brockmann et al. unraveled that long-range human mobility and interactions generate novel irregular spreading patterns without an apparent wavefront [87] . Such complex dynamical features require a mathematical description of fractional diffusion equations, and they are also well captured by the networked metapopulations. Colizza et al. 
[69] developed a global stochastic metapopulation model in, using the data of worldwide scheduled flights and census populations to establish a complete worldwide air transportation network (more than 3000 airports). They studied the predictability and the reliability of the pandemic forecast with respect to the intrinsic stochasticity, and declared that the topological heterogeneity reduces the predictability, whereas author/funder. All rights reserved. No reuse allowed without permission.', 'cite_spans': [{'start': 269, 'end': 273, 'text': '[87]', 'ref_id': 'BIBREF85'}, {'start': 459, 'end': 463, 'text': '[69]', 'ref_id': 'BIBREF68'}], 'ref_spans': [], 'section': 'Inter-subpopulation invasion.'}, {'text': 'The copyright holder for this preprint (which was not peer-reviewed) is the . https://doi.org/10.1101/003889 doi: bioRxiv preprint the high-level heterogeneity of traffic flows improves the pandemic predictability.', 'cite_spans': [], 'ref_spans': [], 'section': 'Inter-subpopulation invasion.'}, {'text': 'As illustrated by Fig. 3 a, air traffic network acts as a major channel serving human long-range travels, which mediates the pandemic transmission on a large geographic scale. The epidemic dynamics occurred under this scenario is well characterized by the reaction-diffusion processes [88] , which are also widely applied to model phenomena as diverse as genetic drift, chemical reactions, and population evolution [2] .', 'cite_spans': [{'start': 285, 'end': 289, 'text': '[88]', 'ref_id': 'BIBREF86'}, {'start': 415, 'end': 418, 'text': '[2]', 'ref_id': 'BIBREF1'}], 'ref_spans': [{'start': 18, 'end': 24, 'text': 'Fig. 3', 'ref_id': 'FIGREF3'}], 'section': 'Inter-subpopulation invasion.'}, {'text': 'From a theoretical viewpoint, it is significant to analyze the epidemic threshold, which is instructive for the assessment of the disease transmissibility as well as the outbreak potential. 
Such information is also important to regulate the implementation of intervention strategies. Based on the empirical evidences that the topology of various socio-technical networks including the airline network presents a high-level heterogeneity, Colizza et al. [88] studied the effect of general heterogeneous networks, demonstrating that the epidemic threshold is significantly decreased with the augmentation of topological fluctuations. Considering that the theory developed in Colizza et al. [88] is based on the simplification that individual diffusion rate per subpopulation is inversely proportional to the degree of subpopulations, Colizza and Vespignani [89] generalized the study by introducing more realistic diffusion rules, such as the traffic-and the population-dependent patterns. Importantly, using the approach of branching process, Colizza and Vespignani [89] proposed a global invasion threshold, R ⋆ , which distinguishes the lower bound condition for transmitting the infections to downstream unaffected subpopulations. The formula of R ⋆ author/funder. All rights reserved. No reuse allowed without permission.', 'cite_spans': [{'start': 453, 'end': 457, 'text': '[88]', 'ref_id': 'BIBREF86'}, {'start': 688, 'end': 692, 'text': '[88]', 'ref_id': 'BIBREF86'}, {'start': 855, 'end': 859, 'text': '[89]', 'ref_id': 'BIBREF88'}, {'start': 1065, 'end': 1069, 'text': '[89]', 'ref_id': 'BIBREF88'}], 'ref_spans': [], 'section': 'Inter-subpopulation invasion.'}, {'text': 'The copyright holder for this preprint (which was not peer-reviewed) is the . https://doi.org/10.1101/003889 doi: bioRxiv preprint can be summarized as R ⋆ = E(R 0 , µ)T (k, θ, ω 0 ), which combines the epidemiology factors E(R 0 , µ) with the diffusion properties of mobility networks T (k, θ, ω 0 ). For large networks with a high-level topological heterogeneity, the mobility item T diverges, i.e., T −1 → 0, thus R ⋆ is always larger than unity, which leads to a decreased epidemic threshold. 
Based on the observation that human beings usually do not perform random walks, yet have specific travel destinations, Tang et al. [90] addressed the effect of objective traveling behavior which enlarges the final morbidity.', 'cite_spans': [{'start': 628, 'end': 632, 'text': '[90]', 'ref_id': 'BIBREF89'}], 'ref_spans': [], 'section': 'Inter-subpopulation invasion.'}, {'text': 'The above studies mainly concern the influence of human random diffusion, usually defining the mobility scheme as a Markovian memoryless diffusive process [91] . Recent empirical findings on human mobility have shown the crucial role of commuting mobility in human daily transportation, which is reflected by the individual recurrent movement between frequently visited locations such as household, school, and workplace [92] [93] [94] [95] . Fig. 3b visualizes the US commuting network with the census data on commuting trips between counties [96] . Evidently, the structural features are different between the commuting network and the air transportation network.', 'cite_spans': [{'start': 155, 'end': 159, 'text': '[91]', 'ref_id': 'BIBREF90'}, {'start': 421, 'end': 425, 'text': '[92]', 'ref_id': 'BIBREF91'}, {'start': 426, 'end': 430, 'text': '[93]', 'ref_id': 'BIBREF92'}, {'start': 431, 'end': 435, 'text': '[94]', 'ref_id': 'BIBREF93'}, {'start': 436, 'end': 440, 'text': '[95]', 'ref_id': 'BIBREF94'}, {'start': 544, 'end': 548, 'text': '[96]', 'ref_id': None}], 'ref_spans': [{'start': 443, 'end': 450, 'text': 'Fig. 3b', 'ref_id': 'FIGREF3'}], 'section': 'Inter-subpopulation invasion.'}, {'text': 'It might be infeasible to analyze the non-Markovian properties of human commuting with previous reaction-diffusion theory. 
In this regard, Balcan and Vespignani [91] extended the metapopulation framework by considering the impact of human recurrent commuting, which assumes that individuals remember their subpopulations of residence, with a constraint that commuters staying at their destination subpopulations cannot continue moving to other places but return to the residences with a certain rate. The approach of timescale separation is applied to perform theoretical analysis, since in reality the number of frequent commuters only accounts for a small fraction of local populations. This leads to a mean-field description of stationary populations distribution. Generalizing the theory of branching process, Balcan and Vespignani [97] obtained the global invasion threshold for the reaction-commuting networked metapopulation systems, which establishes a new threshold relevant to the typical visiting duration of commuters. With a high return rate, the sojourn time (i.e., length of stay) of infected commuters might be too short to transmit the infection to susceptibles in adjacent unaffected subpopulations.', 'cite_spans': [{'start': 161, 'end': 165, 'text': '[91]', 'ref_id': 'BIBREF90'}, {'start': 836, 'end': 840, 'text': '[97]', 'ref_id': 'BIBREF96'}], 'ref_spans': [], 'section': 'Inter-subpopulation invasion.'}, {'text': 'To study the dynamical differences between the reactioncommuting and the reaction-diffusion processes, Belik et al. [98] analyzed their respective traveling wave solutions on the one dimensional lattice. As the diffusion rate increases, spatially constrained human commuting generates a saturated threshold of the wave front velocity, whereas the reactiondiffusion model has an unbounded front velocity threshold. Such distinction implies that the estimation of transmission speed might be overestimated under the reaction-diffusion framework. Besides, they have also found that the character-istic sojourn time spent by commuters induces a novel epidemic threshold. 
Since airline traffic and ground commuting networks both serve human routine transportation, Balcan et al. [94] developed a multiscale networked metapopulation model, where the commuting networks in about 30 countries were embedded into the worldwide long-range air transportation network. The introduction of short-range commuting mobility enhances the synchronization of epidemic evolution profiles for subpopulations in close geographical proximity.', 'cite_spans': [{'start': 116, 'end': 120, 'text': '[98]', 'ref_id': 'BIBREF97'}, {'start': 774, 'end': 778, 'text': '[94]', 'ref_id': 'BIBREF93'}], 'ref_spans': [], 'section': 'Inter-subpopulation invasion.'}, {'text': 'Human beings are intelligent. Their risk perception and adaptive abilities promote the active response to epidemic outbreaks, which might in turn alter the disease propagation [99] [100] [101] . Many works [102] [103] [104] [105] [106] [107] [108] [109] [110] [111] have investigated the effect of disease-behavior mutual feedback on compartment models as well as network epidemiology, and recent research topics also begin the generalization to deal with human behavior of mobility response. 
For example, [112, 113] analyzed the impact of self-initiated mobility on the invasion threshold, showing a counterintuitive phenomenon that the mobility change of avoiding infected locations with high prevalences enhances the disease spreading to the entire system.', 'cite_spans': [{'start': 176, 'end': 180, 'text': '[99]', 'ref_id': 'BIBREF98'}, {'start': 181, 'end': 186, 'text': '[100]', 'ref_id': 'BIBREF99'}, {'start': 187, 'end': 192, 'text': '[101]', 'ref_id': 'BIBREF100'}, {'start': 206, 'end': 211, 'text': '[102]', 'ref_id': 'BIBREF101'}, {'start': 212, 'end': 217, 'text': '[103]', 'ref_id': 'BIBREF102'}, {'start': 218, 'end': 223, 'text': '[104]', 'ref_id': 'BIBREF103'}, {'start': 224, 'end': 229, 'text': '[105]', 'ref_id': 'BIBREF104'}, {'start': 230, 'end': 235, 'text': '[106]', 'ref_id': 'BIBREF105'}, {'start': 236, 'end': 241, 'text': '[107]', 'ref_id': 'BIBREF106'}, {'start': 242, 'end': 247, 'text': '[108]', 'ref_id': 'BIBREF107'}, {'start': 248, 'end': 253, 'text': '[109]', 'ref_id': 'BIBREF108'}, {'start': 254, 'end': 259, 'text': '[110]', 'ref_id': 'BIBREF109'}, {'start': 260, 'end': 265, 'text': '[111]', 'ref_id': 'BIBREF110'}, {'start': 506, 'end': 511, 'text': '[112,', 'ref_id': 'BIBREF111'}, {'start': 512, 'end': 516, 'text': '113]', 'ref_id': 'BIBREF112'}], 'ref_spans': [], 'section': 'Inter-subpopulation invasion.'}, {'text': 'The above studies focus on understanding the influence of inter-subpopulation human mobility patterns, generally assuming that the individuals behave identically in each subpopulation. 
However, the diversity of individual behaviors in different subpopulations also affects the pandemic spreading.', 'cite_spans': [], 'ref_spans': [], 'section': 'Intra-subpopulation contagion.'}, {'text': 'Although it is well-known that human contacts have crucial impact on the spatiotemporal dynamics of infectious diseases in a population [3] , previous works assumed that individual contact patterns are identical among all subpopulations. Since the basic reproductive number, R 0 , is equivalent to the same constant in all subpopulations, it is predictable that the epidemic attack rates as well as evolution profiles in different areas are similar, as one can clearly observe in [114] .', 'cite_spans': [{'start': 136, 'end': 139, 'text': '[3]', 'ref_id': 'BIBREF2'}, {'start': 480, 'end': 485, 'text': '[114]', 'ref_id': 'BIBREF113'}], 'ref_spans': [], 'section': 'Intra-subpopulation contagion.'}, {'text': 'At the intra-subpopulation scale, aside from the empirical support from the data analysis of intra-urban human mobility (see Section 3), the feasibility of the \"well-mixed\" contacts assumption is also consistent with the recent findings on interactive patterns of human contact. For example, diverse digital instruments, e.g., wireless sensors [115] , active Radio Frequency Identification (RFID) devices [116, 117] , and WiFi [118] [119] [120] (we resort to the WiFi technology in our social experiments, due to its ubiquity in urban areas), have been deployed in realistic social circumstances to collect the data of human close proximity contacts [121] . The data analyses have unveiled an unexpected feature that the squared coefficient of variance is quite small for the distribution of the number of distinct persons each individual author/funder. All rights reserved. 
No reuse allowed without permission.', 'cite_spans': [{'start': 344, 'end': 349, 'text': '[115]', 'ref_id': 'BIBREF114'}, {'start': 405, 'end': 410, 'text': '[116,', 'ref_id': 'BIBREF115'}, {'start': 411, 'end': 415, 'text': '117]', 'ref_id': 'BIBREF116'}, {'start': 427, 'end': 432, 'text': '[118]', 'ref_id': 'BIBREF117'}, {'start': 433, 'end': 438, 'text': '[119]', 'ref_id': 'BIBREF118'}, {'start': 439, 'end': 444, 'text': '[120]', 'ref_id': 'BIBREF119'}, {'start': 650, 'end': 655, 'text': '[121]', 'ref_id': 'BIBREF120'}], 'ref_spans': [], 'section': 'Intra-subpopulation contagion.'}, {'text': 'The copyright holder for this preprint (which was not peer-reviewed) is the . https://doi.org/10.1101/003889 doi: bioRxiv preprint Origin-driven contact scenario g g Fig. 4 (Color online) Effect of location-specific human contact patterns. a-b illustrate the structure of the phenomenological metapopulation model used in [124] , where the reaction-commuting processes couple two typical subpopulations x,y. In the destination-driven scenario (a), individual characteristic contact rates (cx, cy) depend on the visited locations, while in the origin-driven scenario (b), the contacts of individuals correlate to their subpopulations of residence. c-d present the phase diagrams of the global R g 0 under these two scenarios, respectively. The white dashed curve in each panel shows the global threshold R g 0 = 1 obtained through the NGM analysis. From Ref. [124] .', 'cite_spans': [{'start': 322, 'end': 327, 'text': '[124]', 'ref_id': 'BIBREF123'}, {'start': 858, 'end': 863, 'text': '[124]', 'ref_id': 'BIBREF123'}], 'ref_spans': [{'start': 166, 'end': 172, 'text': 'Fig. 
4', 'ref_id': None}], 'section': 'Intra-subpopulation contagion.'}, {'text': 'encounters per day [115] [116] [117] [118] [119] , which implies the presence of a characteristic contact rate within each subpopulation.', 'cite_spans': [{'start': 19, 'end': 24, 'text': '[115]', 'ref_id': 'BIBREF114'}, {'start': 25, 'end': 30, 'text': '[116]', 'ref_id': 'BIBREF115'}, {'start': 31, 'end': 36, 'text': '[117]', 'ref_id': 'BIBREF116'}, {'start': 37, 'end': 42, 'text': '[118]', 'ref_id': 'BIBREF117'}, {'start': 43, 'end': 48, 'text': '[119]', 'ref_id': 'BIBREF118'}], 'ref_spans': [], 'section': 'Intra-subpopulation contagion.'}, {'text': 'Note that the characteristic contact rate might vary evidently in different subpopulations. As illustrated by empirical studies [122, 123] , in reality, location-specific factors are the potential drivers resulting in a substantial variation of disease incidences between populations. Inspired by this finding, Wang et al. [124, 125] introduced two categories of location-specific human contact patterns into a phenomenological reaction-commuting metapopulation model. A simple destination-driven scenario is considered first, where individual contact features are determined by the visited locations. Since the residence and the destination can be distinguished by the commuting mobility, an origin-driven scenario is also introduced, where the contacts of individuals are relevant to their subpopulations of residence. 
Figures 4(a) -(b) illustrate the modeling structures of these two scenarios.', 'cite_spans': [{'start': 128, 'end': 133, 'text': '[122,', 'ref_id': 'BIBREF121'}, {'start': 134, 'end': 138, 'text': '123]', 'ref_id': 'BIBREF122'}, {'start': 323, 'end': 328, 'text': '[124,', 'ref_id': 'BIBREF123'}, {'start': 329, 'end': 333, 'text': '125]', 'ref_id': 'BIBREF124'}], 'ref_spans': [{'start': 821, 'end': 833, 'text': 'Figures 4(a)', 'ref_id': None}], 'section': 'Intra-subpopulation contagion.'}, {'text': 'In these cases, it is infeasible to analyze the invasion threshold through the theory of branching process, since the prerequisite of identical basic reproductive number in all subpopulations is invalid. Instead, the next generation matrix (NGM) approach [126] can be applied to analyze the global outbreak threshold R g 0 here. Due to the mixing of individuals with heterogeneous contact capacities in each subpopulation, which is analogous to the effect induced by annealed heterogeneous networks [45] , the addressed locationspecific contact patterns reduce the epidemic threshold significantly, and thus favor disease outbreaks in contrast to the traditional homogeneous cases. Figs. 4 c-d show the phase diagrams of the global R g 0 under these two types of contact patterns, respectively. Interestingly, the variance of disease prevalence under the destination-driven scenario has a monotonic dependence on the characteristic contact rates, whereas under the origin-driven scenario, counterintuitively, the increase of contact rates weakens the disease prevalence in some parametric ranges. This topic was also extended to study the metapopulation network, which unraveled a new problem of disease localization, i.e., the epidemic might be localized on a finite number of highly connected hubs. author/funder. All rights reserved. 
No reuse allowed without permission.', 'cite_spans': [{'start': 255, 'end': 260, 'text': '[126]', 'ref_id': 'BIBREF125'}, {'start': 499, 'end': 503, 'text': '[45]', 'ref_id': 'BIBREF44'}], 'ref_spans': [], 'section': 'Intra-subpopulation contagion.'}, {'text': 'The copyright holder for this preprint (which was not peer-reviewed) is the . https://doi.org/10.1101/003889 doi: bioRxiv preprint', 'cite_spans': [], 'ref_spans': [], 'section': 'Intra-subpopulation contagion.'}, {'text': 'Other types of human behavioral diversity have also been considered recently. Motivated by the evidence that the diversity of travel habits or trip durations might yield heterogeneity in the sojourn time spent at destinations, Poletto et al. [127] studied the impact of large fluctuations of visiting durations on the epidemic threshold, finding that the positively-correlated and the negatively-correlated degreebased staying durations lead to distinct invasion paths to global outbreaks. Based on the observation that the specific curing (recovery) condition depends on the available medical resources supplied by local health sectors, Shen et al. [128] studied the effect of degree-dependent curing rates, which demonstrates that an optimal intervention performance with the largest epidemic threshold is obtained by designing the heterogeneous distribution of curing rates as a superlinear mode. Since the epidemic spreading is also relevant to casual contacts during public gatherings, Cao et al. 
[129] introduced the rendezvous effect into a bipartite metapopulation network, and showed that the rendezvous-induced transmission accelerates the pandemic outbreaks.', 'cite_spans': [{'start': 242, 'end': 247, 'text': '[127]', 'ref_id': 'BIBREF126'}, {'start': 650, 'end': 655, 'text': '[128]', 'ref_id': 'BIBREF127'}, {'start': 1002, 'end': 1007, 'text': '[129]', 'ref_id': 'BIBREF128'}], 'ref_spans': [], 'section': 'Intra-subpopulation contagion.'}, {'text': 'The study of metapopulation model not only expands our knowledge on the dynamics of spatial epidemic spreading, but also manifests the power in evaluating the performance of intervention strategies. For example, although the strategy of travel ban is usually deployed during a pandemic outbreak in reality, it is unclear whether the effectiveness is excellent enough in limiting the pandemic spreading. Counterintuitively, recent studies have unraveled the limited utility of travel restrictions: Even if the worldwide air traffic is decreased to an unprecedented low level, e.g., less than 10 %, the disease landing to unaffected regions is only postponed several weeks [130] [131] [132] [133] ; the contribution to reducing the morbity is also quite limited [130, 131, 134] . 
Such findings are consistent with the aforementioned fact that the global invasion threshold is decreased significantly by the presence of the high-level topological heterogeneity.', 'cite_spans': [{'start': 671, 'end': 676, 'text': '[130]', 'ref_id': 'BIBREF129'}, {'start': 677, 'end': 682, 'text': '[131]', 'ref_id': 'BIBREF130'}, {'start': 683, 'end': 688, 'text': '[132]', 'ref_id': 'BIBREF131'}, {'start': 689, 'end': 694, 'text': '[133]', 'ref_id': 'BIBREF132'}, {'start': 760, 'end': 765, 'text': '[130,', 'ref_id': 'BIBREF129'}, {'start': 766, 'end': 770, 'text': '131,', 'ref_id': 'BIBREF130'}, {'start': 771, 'end': 775, 'text': '134]', 'ref_id': 'BIBREF133'}], 'ref_spans': [], 'section': 'Performance of intervention strategies'}, {'text': 'It thus becomes urgent to study the controllability of intra-subpopulation measures, such as the usage of vaccine or antiviral drugs, and the implementation of communitybased interventions, which are typical containment strategies suggested by the World Health Organization (WHO) [55] . 
To estimate and also to improve the performance of disease response plans on decreasing the morbidity, large-scale computational simulations have been performed extensively to study various types of pharmaceutical interventions [4, 14, 56, 57, 60, 68, [134] [135] [136] [137] [138] [139] , which aid in identifying the targetedgroups and guiding the deployment of limited resources.', 'cite_spans': [{'start': 280, 'end': 284, 'text': '[55]', 'ref_id': 'BIBREF54'}, {'start': 515, 'end': 518, 'text': '[4,', 'ref_id': 'BIBREF3'}, {'start': 519, 'end': 522, 'text': '14,', 'ref_id': 'BIBREF13'}, {'start': 523, 'end': 526, 'text': '56,', 'ref_id': 'BIBREF55'}, {'start': 527, 'end': 530, 'text': '57,', 'ref_id': 'BIBREF56'}, {'start': 531, 'end': 534, 'text': '60,', 'ref_id': 'BIBREF59'}, {'start': 535, 'end': 538, 'text': '68,', 'ref_id': 'BIBREF67'}, {'start': 539, 'end': 544, 'text': '[134]', 'ref_id': 'BIBREF133'}, {'start': 545, 'end': 550, 'text': '[135]', 'ref_id': 'BIBREF134'}, {'start': 551, 'end': 556, 'text': '[136]', 'ref_id': 'BIBREF136'}, {'start': 557, 'end': 562, 'text': '[137]', 'ref_id': 'BIBREF137'}, {'start': 563, 'end': 568, 'text': '[138]', 'ref_id': 'BIBREF138'}, {'start': 569, 'end': 574, 'text': '[139]', 'ref_id': 'BIBREF139'}], 'ref_spans': [], 'section': 'Performance of intervention strategies'}, {'text': 'Despite technical difficulties, it is probable to analyze the delaying effect of different strategies. With the theory of renewal process, Wang et al. [140] developed a general mathematical framework to deal with the scenario of minimum metapopulation, where two typical subpopulations are connected by the travel flows. This is a rational approximation of the initial stage of an outbreak. It is shown that with a short response time, the intra-subpopulation measures perform much better than that of the inter-subpopulation travel restrictions. 
However, this advantage is weakened considerably as the response time increases.', 'cite_spans': [{'start': 151, 'end': 156, 'text': '[140]', 'ref_id': 'BIBREF140'}], 'ref_spans': [], 'section': 'Performance of intervention strategies'}, {'text': 'Recent clinical evidences obtained from the real-world pandemic campaigns have uncovered new problems on the prompt response with pharmaceutical interventions. For example, there presents an unavoidable delay of 4-6 months for developing the proper vaccine against a particular pandemic virus [141] [142] [143] ; and an extensive usage of antiviral drugs might induce the prevalence of antiviral resistance [144] [145] [146] . Therefore, it is crucial to thoroughly examine the effectiveness of community-based interventions by using the models of networked metapopulation, which deserves more efforts in near future.', 'cite_spans': [{'start': 293, 'end': 298, 'text': '[141]', 'ref_id': 'BIBREF141'}, {'start': 299, 'end': 304, 'text': '[142]', 'ref_id': 'BIBREF142'}, {'start': 305, 'end': 310, 'text': '[143]', 'ref_id': 'BIBREF143'}, {'start': 407, 'end': 412, 'text': '[144]', 'ref_id': 'BIBREF144'}, {'start': 413, 'end': 418, 'text': '[145]', 'ref_id': 'BIBREF145'}, {'start': 419, 'end': 424, 'text': '[146]', 'ref_id': 'BIBREF146'}], 'ref_spans': [], 'section': 'Performance of intervention strategies'}, {'text': 'Networked metapopulation contributes an ideal epidemic modeling platform, which promotes our understanding on the dynamics of large-scale geographic transmission of emergent diseases. The models have the potential to be applied in the real-time numerical pandemic forecast, and are also very useful in evaluating the effectiveness of disease response strategies.', 'cite_spans': [], 'ref_spans': [], 'section': 'Conclusions & outlooks'}, {'text': 'Recently, the good, the bad and the ugly facts of the Big Data have triggered extensive debates around the world. 
The interdisciplinary research of metapopulation epidemiology establishes a paradigm for the study of data science, since one remarkable progress in this field is the innovative usage of fine-grained data in verifying key assumptions and in establishing model substrates. Technical developments in the data collection, processing and analysis not only offer key insights into the dynamical properties of human mobility infrastructures as well as human behavioral diversity, but also raise new questions referring to their influences on the spatial transmission of emerging infectious diseases. Such methodology can be applied to study diverse types of contagion phenomena, including the spreading of computer viruses, information, innovations, emotion, behavior, crisis, culture, etc.', 'cite_spans': [], 'ref_spans': [], 'section': 'Conclusions & outlooks'}, {'text': 'At the end of discussions, some open questions still deserve to be addressed. The development of the sophisticated computational techniques and the consideration of detailed human/population dynamics are quite important for the research of spatial epidemiology. However, it is also crucial to understand the fundamental principals governing the complex contagion phenomena [147] . In this regard, an interesting question poses itself, namely, whether it is possible to author/funder. All rights reserved. No reuse allowed without permission.', 'cite_spans': [{'start': 373, 'end': 378, 'text': '[147]', 'ref_id': 'BIBREF147'}], 'ref_spans': [], 'section': 'Conclusions & outlooks'}, {'text': 'The copyright holder for this preprint (which was not peer-reviewed) is the . 
https://doi.org/10.1101/003889 doi: bioRxiv preprint define a unified mathematical framework that can characterize different kinds of spatial dynamics models of emerging diseases.', 'cite_spans': [], 'ref_spans': [], 'section': 'Conclusions & outlooks'}, {'text': 'It is also probable to generalize present theoretical results to deal with reverse problems, such as the identification of infection sources [147] [148] [149] , possible mobility networks [150] , and disease invasion process. Such inference problems are valuable to establish an optimal response plan for tracing and preventing the pandemics.', 'cite_spans': [{'start': 141, 'end': 146, 'text': '[147]', 'ref_id': 'BIBREF147'}, {'start': 147, 'end': 152, 'text': '[148]', 'ref_id': 'BIBREF148'}, {'start': 153, 'end': 158, 'text': '[149]', 'ref_id': 'BIBREF149'}, {'start': 188, 'end': 193, 'text': '[150]', 'ref_id': 'BIBREF150'}], 'ref_spans': [], 'section': 'Conclusions & outlooks'}], 'bib_entries': {'BIBREF0': {'ref_id': 'b0', 'title': 'Some demographic and genetic consequences of environmental heterogeneity for biological control', 'authors': [{'first': 'R', 'middle': [], 'last': 'Levins', 'suffix': ''}], 'year': 1969, 'venue': 'Bull Entomol Soc Am', 'volume': '15', 'issn': '', 'pages': '237--240', 'other_ids': {}}, 'BIBREF1': {'ref_id': 'b1', 'title': 'Ecology, Genetics and Evolution of Metapopulations', 'authors': [], 'year': 2004, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF2': {'ref_id': 'b2', 'title': 'Infectious diseases of humans: Dynamics and control', 'authors': [{'first': 'R M', 'middle': [], 'last': 'Anderson', 'suffix': ''}, {'first': 'R', 'middle': [], 'last': 'May', 'suffix': ''}], 'year': 1991, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF3': {'ref_id': 'b3', 'title': 'Modeling infectious diseases in humans and animals', 'authors': [{'first': 'M J', 'middle': [], 'last': 'Keeling', 'suffix': ''}, {'first': 'P', 'middle': [], 
'last': 'Rohani', 'suffix': ''}], 'year': 2008, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF4': {'ref_id': 'b4', 'title': \"Bernoulli's epidemiological model revisited\", 'authors': [{'first': 'K', 'middle': [], 'last': 'Dietz', 'suffix': ''}, {'first': 'J A P', 'middle': [], 'last': 'Heesterbeek', 'suffix': ''}, {'first': '', 'middle': [], 'last': 'Daniel', 'suffix': ''}], 'year': 2002, 'venue': 'Math Biosci', 'volume': '180', 'issn': '', 'pages': '1--21', 'other_ids': {}}, 'BIBREF5': {'ref_id': 'b5', 'title': 'Statistical mechanics of complex networks', 'authors': [{'first': 'R', 'middle': [], 'last': 'Albert', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Barabási', 'suffix': ''}], 'year': 2002, 'venue': 'Rev Mod Phys', 'volume': '74', 'issn': '', 'pages': '47--97', 'other_ids': {}}, 'BIBREF6': {'ref_id': 'b6', 'title': 'Complex networks: Structure, robustness and function', 'authors': [{'first': 'R', 'middle': [], 'last': 'Cohen', 'suffix': ''}, {'first': 'S', 'middle': [], 'last': 'Havlin', 'suffix': ''}], 'year': 2010, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF7': {'ref_id': 'b7', 'title': 'Networks: An introduction', 'authors': [{'first': 'M E J', 'middle': [], 'last': 'Newman', 'suffix': ''}], 'year': 2010, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF8': {'ref_id': 'b8', 'title': 'Introduction to complex networks: Models, structures, and dynamics', 'authors': [{'first': 'G R', 'middle': [], 'last': 'Chen', 'suffix': ''}, {'first': 'X F', 'middle': [], 'last': 'Wang', 'suffix': ''}, {'first': 'X', 'middle': [], 'last': 'Li', 'suffix': ''}], 'year': 2012, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF9': {'ref_id': 'b9', 'title': 'Collective dynamics of small-world networks', 'authors': [{'first': 'D J', 'middle': [], 'last': 'Watts', 'suffix': ''}, {'first': 'S', 'middle': [], 'last': 'Strogatz', 'suffix': ''}], 'year': 
1998, 'venue': 'Nature', 'volume': '393', 'issn': '', 'pages': '440--442', 'other_ids': {}}, 'BIBREF10': {'ref_id': 'b10', 'title': 'Emergence of scaling in random networks', 'authors': [{'first': 'A L', 'middle': [], 'last': 'Barabási', 'suffix': ''}, {'first': 'Albert', 'middle': ['R'], 'last': '', 'suffix': ''}], 'year': 1999, 'venue': 'Science', 'volume': '286', 'issn': '', 'pages': '509--512', 'other_ids': {}}, 'BIBREF11': {'ref_id': 'b11', 'title': 'Complex networks: Structure and dynamics', 'authors': [{'first': 'S', 'middle': [], 'last': 'Boccaletti', 'suffix': ''}, {'first': 'V', 'middle': [], 'last': 'Latora', 'suffix': ''}, {'first': 'Y', 'middle': [], 'last': 'Moreno', 'suffix': ''}], 'year': 2006, 'venue': 'Phys Rep', 'volume': '424', 'issn': '', 'pages': '175--308', 'other_ids': {}}, 'BIBREF12': {'ref_id': 'b12', 'title': 'Critical phenomena in complex networks', 'authors': [{'first': 'S N', 'middle': [], 'last': 'Dorogovtsev', 'suffix': ''}, {'first': 'A V', 'middle': [], 'last': 'Goltsev', 'suffix': ''}, {'first': 'J F F', 'middle': [], 'last': 'Mendes', 'suffix': ''}], 'year': 2008, 'venue': 'Rev Mod Phys', 'volume': '80', 'issn': '', 'pages': '1275--1335', 'other_ids': {}}, 'BIBREF13': {'ref_id': 'b13', 'title': 'Dynamical processes on complex networks', 'authors': [{'first': 'A', 'middle': [], 'last': 'Barrat', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Barthélemy', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Vespignani', 'suffix': ''}], 'year': 2008, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF14': {'ref_id': 'b14', 'title': 'Statistical physics of social dynamics', 'authors': [{'first': 'C', 'middle': [], 'last': 'Castellano', 'suffix': ''}, {'first': 'S', 'middle': [], 'last': 'Fortunato', 'suffix': ''}, {'first': 'V', 'middle': [], 'last': 'Loreto', 'suffix': ''}], 'year': 2009, 'venue': 'Rev Mod Phys', 'volume': '81', 'issn': '', 'pages': '591--646', 'other_ids': {}}, 'BIBREF15': {'ref_id': 
'b15', 'title': 'Error and attack tolerance of complex networks', 'authors': [{'first': 'R', 'middle': [], 'last': 'Albert', 'suffix': ''}, {'first': 'H', 'middle': [], 'last': 'Jeong', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Barabási', 'suffix': ''}], 'year': 2000, 'venue': 'Nature', 'volume': '406', 'issn': '', 'pages': '378--382', 'other_ids': {}}, 'BIBREF16': {'ref_id': 'b16', 'title': 'Networks formed from interdependent networks', 'authors': [{'first': 'J X', 'middle': [], 'last': 'Gao', 'suffix': ''}, {'first': 'S V', 'middle': [], 'last': 'Buldyrev', 'suffix': ''}, {'first': 'H E', 'middle': [], 'last': 'Stanley', 'suffix': ''}], 'year': 2012, 'venue': 'Nat Phys', 'volume': '8', 'issn': '', 'pages': '40--48', 'other_ids': {}}, 'BIBREF17': {'ref_id': 'b17', 'title': 'Pinning a complex dynamical network to its equilibrium', 'authors': [{'first': 'X', 'middle': [], 'last': 'Li', 'suffix': ''}, {'first': 'X F', 'middle': [], 'last': 'Wang', 'suffix': ''}, {'first': 'G', 'middle': ['R'], 'last': 'Chen', 'suffix': ''}], 'year': 2004, 'venue': 'IEEE Trans Circuits Syst I', 'volume': '51', 'issn': '', 'pages': '2074--2087', 'other_ids': {}}, 'BIBREF18': {'ref_id': 'b18', 'title': 'Network synchronizability analysis: A graph-theoretic approach', 'authors': [{'first': 'G R', 'middle': [], 'last': 'Chen', 'suffix': ''}, {'first': 'Z', 'middle': [], 'last': 'Duan', 'suffix': ''}], 'year': 2008, 'venue': 'Chaos', 'volume': '18', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF19': {'ref_id': 'b19', 'title': 'Synchronization in complex networks', 'authors': [{'first': 'A', 'middle': [], 'last': 'Arenas', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Díaz-Guilera', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Kurths', 'suffix': ''}], 'year': 2008, 'venue': 'Phys Rep', 'volume': '469', 'issn': '', 'pages': '93--153', 'other_ids': {}}, 'BIBREF20': {'ref_id': 'b20', 'title': 'A guide to first-passage processes', 'authors': [{'first': 'S', 
'middle': [], 'last': 'Redner', 'suffix': ''}], 'year': 2001, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF21': {'ref_id': 'b21', 'title': 'Heterogeneous voter models', 'authors': [{'first': 'N', 'middle': [], 'last': 'Masuda', 'suffix': ''}, {'first': 'N', 'middle': [], 'last': 'Gibert', 'suffix': ''}, {'first': 'S', 'middle': [], 'last': 'Redner', 'suffix': ''}], 'year': 2010, 'venue': 'Phys Rev E', 'volume': '82', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF22': {'ref_id': 'b22', 'title': 'Consensus of sampled-data multi-agent networking systems via model predictive control', 'authors': [{'first': 'J Y', 'middle': [], 'last': 'Zhan', 'suffix': ''}, {'first': 'X', 'middle': [], 'last': 'Li', 'suffix': ''}], 'year': 2013, 'venue': 'Automatica', 'volume': '49', 'issn': '', 'pages': '2502--2507', 'other_ids': {}}, 'BIBREF23': {'ref_id': 'b23', 'title': 'Freezing period strongly impacts the emergence of a global consensus in the voter model', 'authors': [{'first': 'Z', 'middle': [], 'last': 'Wang', 'suffix': ''}, {'first': 'Y', 'middle': [], 'last': 'Liu', 'suffix': ''}, {'first': 'L', 'middle': [], 'last': 'Wang', 'suffix': ''}], 'year': 2014, 'venue': 'Sci Rep', 'volume': '4', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF24': {'ref_id': 'b24', 'title': 'Control and flocking of networked systems via pinning', 'authors': [{'first': 'X F', 'middle': [], 'last': 'Wang', 'suffix': ''}, {'first': 'X', 'middle': [], 'last': 'Li', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Lv', 'suffix': ''}], 'year': 2010, 'venue': 'IEEE Circ Syst Mag', 'volume': '10', 'issn': '', 'pages': '83--91', 'other_ids': {}}, 'BIBREF25': {'ref_id': 'b25', 'title': 'Controllability of complex networks', 'authors': [{'first': 'Y Y', 'middle': [], 'last': 'Liu', 'suffix': ''}, {'first': 'J J', 'middle': [], 'last': 'Slotine', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Barabási', 'suffix': ''}], 'year': 2011, 'venue': 'Nature', 'volume': 
'473', 'issn': '', 'pages': '167--173', 'other_ids': {}}, 'BIBREF26': {'ref_id': 'b26', 'title': 'Controlling complex networks: How much energy is needed', 'authors': [{'first': 'G', 'middle': [], 'last': 'Yan', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Ren', 'suffix': ''}, {'first': 'Y C', 'middle': [], 'last': 'Lai', 'suffix': ''}], 'year': 2012, 'venue': 'Phys Rev Lett', 'volume': '108', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF27': {'ref_id': 'b27', 'title': 'Exact controllability of complex networks', 'authors': [{'first': 'Z Z', 'middle': [], 'last': 'Yuan', 'suffix': ''}, {'first': 'C', 'middle': [], 'last': 'Zhao', 'suffix': ''}, {'first': 'Z R', 'middle': [], 'last': 'Di', 'suffix': ''}], 'year': 2013, 'venue': 'Nat Commun', 'volume': '', 'issn': '4', 'pages': '', 'other_ids': {}}, 'BIBREF28': {'ref_id': 'b28', 'title': 'Five rules for the evolution of cooperation', 'authors': [{'first': 'M', 'middle': [], 'last': 'Nowak', 'suffix': ''}], 'year': 2006, 'venue': 'Science', 'volume': '314', 'issn': '', 'pages': '1560--1563', 'other_ids': {}}, 'BIBREF29': {'ref_id': 'b29', 'title': 'Evolutionary games on graphs', 'authors': [{'first': 'G', 'middle': [], 'last': 'Szabó', 'suffix': ''}, {'first': 'G', 'middle': [], 'last': 'Fáth', 'suffix': ''}], 'year': 2007, 'venue': 'Phys Rep', 'volume': '446', 'issn': '', 'pages': '97--216', 'other_ids': {}}, 'BIBREF30': {'ref_id': 'b30', 'title': 'Roles of mixing patterns in cooperation on a scale-free networked game', 'authors': [{'first': 'Z H', 'middle': [], 'last': 'Rong', 'suffix': ''}, {'first': 'X', 'middle': [], 'last': 'Li', 'suffix': ''}, {'first': 'X', 'middle': [], 'last': 'Wang', 'suffix': ''}], 'year': 2007, 'venue': 'Phys Rev E', 'volume': '76', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF31': {'ref_id': 'b31', 'title': 'Inferring reputation promotes the evolution of cooperation in spatial social dilemma games', 'authors': [{'first': 'Z', 'middle': [], 'last': 'Wang', 'suffix': ''}, 
{'first': 'L', 'middle': [], 'last': 'Wang', 'suffix': ''}, {'first': 'Z Y', 'middle': [], 'last': 'Yin', 'suffix': ''}], 'year': 2012, 'venue': 'Plos One', 'volume': '7', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF32': {'ref_id': 'b32', 'title': 'Interdependent network reciprocity in evolutionary games', 'authors': [{'first': 'Z', 'middle': [], 'last': 'Wang', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Szolnoki', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Perc', 'suffix': ''}], 'year': 2013, 'venue': 'Sci Rep', 'volume': '', 'issn': '3', 'pages': '', 'other_ids': {}}, 'BIBREF33': {'ref_id': 'b33', 'title': 'Noise-induced enhancement of network reciprocity in social dilemmas', 'authors': [{'first': 'G Q', 'middle': [], 'last': 'Zhang', 'suffix': ''}, {'first': 'Q B', 'middle': [], 'last': 'Sun', 'suffix': ''}, {'first': '', 'middle': [], 'last': 'Wang', 'suffix': ''}], 'year': 2013, 'venue': 'Chaos Soliton Fract', 'volume': '51', 'issn': '', 'pages': '31--35', 'other_ids': {}}, 'BIBREF34': {'ref_id': 'b34', 'title': 'Evolution and maintenance of cooperation via inheritance of neighborhood relationship', 'authors': [{'first': 'S L', 'middle': [], 'last': 'Tan', 'suffix': ''}, {'first': 'J H', 'middle': [], 'last': 'Lv', 'suffix': ''}, {'first': 'X H', 'middle': [], 'last': 'Yu', 'suffix': ''}], 'year': 2013, 'venue': 'Chin Sci Bull', 'volume': '58', 'issn': '', 'pages': '3491--3498', 'other_ids': {}}, 'BIBREF35': {'ref_id': 'b35', 'title': 'Spontaneous symmetry breaking in interdependent networked game', 'authors': [{'first': 'Q', 'middle': [], 'last': 'Jin', 'suffix': ''}, {'first': 'L', 'middle': [], 'last': 'Wang', 'suffix': ''}, {'first': 'C Y', 'middle': [], 'last': 'Xia', 'suffix': ''}], 'year': 2014, 'venue': 'Sci Rep', 'volume': '4', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF36': {'ref_id': 'b36', 'title': 'General dynamics of topology and traffic on weighted technological networks', 'authors': [{'first': 'W X', 'middle': 
[], 'last': 'Wang', 'suffix': ''}, {'first': 'B H', 'middle': [], 'last': 'Wang', 'suffix': ''}, {'first': 'B', 'middle': [], 'last': 'Hu', 'suffix': ''}], 'year': 2005, 'venue': 'Phys Rev Lett', 'volume': '94', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF37': {'ref_id': 'b37', 'title': 'Traffic-driven epidemic spreading in finite-size scale-free networks', 'authors': [{'first': 'S', 'middle': [], 'last': 'Meloni', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Arenas', 'suffix': ''}, {'first': 'Y', 'middle': [], 'last': 'Moreno', 'suffix': ''}], 'year': 2009, 'venue': 'Proc Natl Acad Sci', 'volume': '106', 'issn': '', 'pages': '16897--16902', 'other_ids': {}}, 'BIBREF38': {'ref_id': 'b38', 'title': 'Analysis of communication network performance from a complex network perspective', 'authors': [{'first': 'J', 'middle': [], 'last': 'Wu', 'suffix': ''}, {'first': 'C K', 'middle': [], 'last': 'Tse', 'suffix': ''}, {'first': 'F C M', 'middle': [], 'last': 'Lau', 'suffix': ''}], 'year': 2013, 'venue': 'IEEE Trans Circuits Syst I', 'volume': '60', 'issn': '', 'pages': '3303--3316', 'other_ids': {}}, 'BIBREF39': {'ref_id': 'b39', 'title': 'Analysis of self-organized criticality in weighted coupled systems', 'authors': [{'first': 'G Q', 'middle': [], 'last': 'Zhang', 'suffix': ''}, {'first': 'L', 'middle': [], 'last': 'Wang', 'suffix': ''}, {'first': 'T', 'middle': ['L'], 'last': 'Chen', 'suffix': ''}], 'year': 2009, 'venue': 'Physica A', 'volume': '388', 'issn': '', 'pages': '1249--1256', 'other_ids': {}}, 'BIBREF40': {'ref_id': 'b40', 'title': 'Self organized criticality in a modified Olami-Feder-Christensen model', 'authors': [{'first': 'G Q', 'middle': [], 'last': 'Zhang', 'suffix': ''}, {'first': 'U', 'middle': [], 'last': 'Tirnakli', 'suffix': ''}, {'first': 'L', 'middle': [], 'last': 'Wang', 'suffix': ''}], 'year': 2011, 'venue': 'Eur Phys J B', 'volume': '82', 'issn': '', 'pages': '83--89', 'other_ids': {}}, 'BIBREF41': {'ref_id': 'b41', 'title': 
'Self-organized criticality analysis of earthquake model based on heterogeneous networks', 'authors': [{'first': 'L', 'middle': [], 'last': 'Wang', 'suffix': ''}, {'first': 'G Q', 'middle': [], 'last': 'Zhang', 'suffix': ''}, {'first': 'T', 'middle': ['L'], 'last': 'Chen', 'suffix': ''}], 'year': 2011, 'venue': 'Commun Theor Phys', 'volume': '55', 'issn': '', 'pages': '89--94', 'other_ids': {}}, 'BIBREF42': {'ref_id': 'b42', 'title': 'Suppressing cascades of load in interdependent networks', 'authors': [{'first': 'C D', 'middle': [], 'last': 'Brummitt', 'suffix': ''}, {'first': 'D'', 'middle': [], 'last': 'Souza', 'suffix': ''}, {'first': 'R M', 'middle': [], 'last': 'Leicht', 'suffix': ''}, {'first': 'E', 'middle': ['A'], 'last': '', 'suffix': ''}], 'year': 2012, 'venue': 'Proc Natl Acad Sci', 'volume': '109', 'issn': '', 'pages': '680--689', 'other_ids': {}}, 'BIBREF43': {'ref_id': 'b43', 'title': 'Evolution and structure of the Internet: A statistical physics approach', 'authors': [{'first': 'R', 'middle': [], 'last': 'Pastor-Satorras', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Vespignani', 'suffix': ''}], 'year': 2004, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF44': {'ref_id': 'b44', 'title': 'Epidemic Spreading in Scale-Free Networks', 'authors': [{'first': 'R', 'middle': [], 'last': 'Pastor-Satorras', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Vespignani', 'suffix': ''}], 'year': 2001, 'venue': 'Phys Rev Lett', 'volume': '86', 'issn': '', 'pages': '3200--3203', 'other_ids': {}}, 'BIBREF45': {'ref_id': 'b45', 'title': 'How viruses spread among computers and people', 'authors': [{'first': 'A L', 'middle': [], 'last': 'Lloyd', 'suffix': ''}, {'first': 'R', 'middle': [], 'last': 'May', 'suffix': ''}], 'year': 2001, 'venue': 'Science', 'volume': '292', 'issn': '', 'pages': '1316--1317', 'other_ids': {}}, 'BIBREF46': {'ref_id': 'b46', 'title': 'Propagation dynamics on complex networks: Models, methods and 
stability analysis', 'authors': [{'first': 'X C', 'middle': [], 'last': 'Fu', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Small', 'suffix': ''}, {'first': 'G', 'middle': [], 'last': 'Chen', 'suffix': ''}], 'year': 2014, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF47': {'ref_id': 'b47', 'title': 'Absence of epidemic threshold in scale-free networks with degree correlations', 'authors': [{'first': 'M', 'middle': [], 'last': 'Boguñá', 'suffix': ''}, {'first': 'R', 'middle': [], 'last': 'Pastor-Satorras', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Vespignani', 'suffix': ''}], 'year': 2003, 'venue': 'Phys Rev Lett', 'volume': '90', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF48': {'ref_id': 'b48', 'title': 'Controlling the spreading in small-world evolving networks: Stability, oscillation, and topology', 'authors': [{'first': 'X', 'middle': [], 'last': 'Li', 'suffix': ''}, {'first': 'X', 'middle': ['F'], 'last': 'Wang', 'suffix': ''}], 'year': 2006, 'venue': 'IEEE Trans Automat Control', 'volume': '51', 'issn': '', 'pages': '534--540', 'other_ids': {}}, 'BIBREF49': {'ref_id': 'b49', 'title': 'Epidemic spreading in community networks', 'authors': [{'first': 'Z H', 'middle': [], 'last': 'Liu', 'suffix': ''}, {'first': 'B', 'middle': [], 'last': 'Hu', 'suffix': ''}], 'year': 2005, 'venue': 'Europhys Lett', 'volume': '72', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF50': {'ref_id': 'b50', 'title': 'Epidemics and immunization on Euclidean distance preferred small-world networks', 'authors': [{'first': 'W P', 'middle': [], 'last': 'Guo', 'suffix': ''}, {'first': 'X', 'middle': [], 'last': 'Li', 'suffix': ''}, {'first': 'X', 'middle': [], 'last': 'Wang', 'suffix': ''}], 'year': 2007, 'venue': 'Physica A', 'volume': '380', 'issn': '', 'pages': '684--690', 'other_ids': {}}, 'BIBREF51': {'ref_id': 'b51', 'title': 'Competing activation mechanisms in epidemics on networks', 'authors': [{'first': 'R', 'middle': [], 
'last': 'Pastor-Satorras', 'suffix': ''}, {'first': 'C', 'middle': [], 'last': 'Castellano', 'suffix': ''}], 'year': 2012, 'venue': 'Sci Rep', 'volume': '2', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF52': {'ref_id': 'b52', 'title': 'Epidemic Threshold for the Susceptible-Infectious-Susceptible Model on Random Networks', 'authors': [{'first': 'R', 'middle': [], 'last': 'Parshani', 'suffix': ''}, {'first': 'S', 'middle': [], 'last': 'Carmi', 'suffix': ''}, {'first': 'S', 'middle': [], 'last': 'Havlin', 'suffix': ''}], 'year': 2010, 'venue': 'Phys Rev Lett', 'volume': '104', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF53': {'ref_id': 'b53', 'title': 'Thresholds for Epidemic Spreading in Networks', 'authors': [{'first': 'C', 'middle': [], 'last': 'Castellano', 'suffix': ''}, {'first': 'R', 'middle': [], 'last': 'Pastor-Satorras', 'suffix': ''}], 'year': 2010, 'venue': 'Phys Rev Lett', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF54': {'ref_id': 'b54', 'title': 'World Health Organization, Pandemic Influenza Preparedness and Response', 'authors': [], 'year': None, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF55': {'ref_id': 'b55', 'title': \"Hong Kong's health system-reflections, perspectives and visions\", 'authors': [{'first': 'G M', 'middle': [], 'last': 'Leung', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Bacon-Shone', 'suffix': ''}], 'year': 2006, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF56': {'ref_id': 'b56', 'title': 'SARS: A case study in emerging infections', 'authors': [{'first': 'A R', 'middle': [], 'last': 'Mclean', 'suffix': ''}, {'first': 'R M', 'middle': [], 'last': 'May', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Pattison', 'suffix': ''}], 'year': 2005, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF57': {'ref_id': 'b57', 'title': 'Pandemic Potential of a Strain of Influenza A (H1N1): Early Findings', 
'authors': [{'first': 'C', 'middle': [], 'last': 'Fraser', 'suffix': ''}, {'first': 'C A', 'middle': [], 'last': 'Donnelly', 'suffix': ''}, {'first': 'S', 'middle': [], 'last': 'Cauchemez', 'suffix': ''}], 'year': 2009, 'venue': 'Science', 'volume': '324', 'issn': '', 'pages': '1557--1561', 'other_ids': {}}, 'BIBREF58': {'ref_id': 'b58', 'title': 'Spread of a novel influenza A(H1N1) virus via global airline transportation', 'authors': [{'first': 'K', 'middle': [], 'last': 'Khan', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Arino', 'suffix': ''}, {'first': 'W', 'middle': [], 'last': 'Hu', 'suffix': ''}], 'year': 2009, 'venue': 'N Engl J Med', 'volume': '361', 'issn': '', 'pages': '212--214', 'other_ids': {}}, 'BIBREF59': {'ref_id': 'b59', 'title': 'The Transmissibility and Control of Pandemic Influenza A (H1N1) Virus', 'authors': [{'first': 'Y', 'middle': [], 'last': 'Yang', 'suffix': ''}, {'first': 'J D', 'middle': [], 'last': 'Sugimoto', 'suffix': ''}, {'first': 'Elizabeth', 'middle': [], 'last': 'Halloran', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': '', 'suffix': ''}], 'year': 2009, 'venue': 'Science', 'volume': '326', 'issn': '', 'pages': '729--733', 'other_ids': {}}, 'BIBREF60': {'ref_id': 'b60', 'title': 'Global mortality estimates for the 2009 influenza pandemic from the GLaMOR project: A modeling study', 'authors': [{'first': 'L', 'middle': [], 'last': 'Simonsen', 'suffix': ''}, {'first': 'P', 'middle': [], 'last': 'Spreeuwenberg', 'suffix': ''}, {'first': 'R', 'middle': [], 'last': 'Lustig', 'suffix': ''}], 'year': 2013, 'venue': 'PLoS Med', 'volume': '', 'issn': '10', 'pages': '', 'other_ids': {}}, 'BIBREF61': {'ref_id': 'b61', 'title': 'Human infection with a novel avian-origin Influenza A (H7N9) virus', 'authors': [{'first': 'R B', 'middle': [], 'last': 'Gao', 'suffix': ''}], 'year': 2013, 'venue': 'N Engl J Med', 'volume': '368', 'issn': '', 'pages': '1888--1897', 'other_ids': {}}, 'BIBREF62': {'ref_id': 'b62', 'title': 'Human 
infection with avian influenza A H7N9 virus: an assessment of clinical severity', 'authors': [{'first': 'H', 'middle': [], 'last': 'Yu', 'suffix': ''}, {'first': 'B J', 'middle': [], 'last': 'Cowling', 'suffix': ''}, {'first': 'L', 'middle': [], 'last': 'Feng', 'suffix': ''}], 'year': 2013, 'venue': 'The Lancet', 'volume': '382', 'issn': '', 'pages': '138--145', 'other_ids': {}}, 'BIBREF63': {'ref_id': 'b63', 'title': 'Epidemiological and risk analysis of the H7N9 subtype influenza outbreak in China at its early stage', 'authors': [{'first': 'Q Y', 'middle': [], 'last': 'Zhuang', 'suffix': ''}, {'first': 'S C', 'middle': [], 'last': 'Wang', 'suffix': ''}, {'first': 'M L', 'middle': [], 'last': 'Wu', 'suffix': ''}], 'year': 2013, 'venue': 'Chin Sci Bull', 'volume': '58', 'issn': '', 'pages': '3183--3187', 'other_ids': {}}, 'BIBREF64': {'ref_id': 'b64', 'title': 'Isolation and characterization of H7N9 viruses from live poultry markets -Implication of the source of current H7N9 infection in humans', 'authors': [{'first': 'J Z', 'middle': [], 'last': 'Shi', 'suffix': ''}, {'first': 'G H', 'middle': [], 'last': 'Deng', 'suffix': ''}, {'first': 'P H', 'middle': [], 'last': 'Liu', 'suffix': ''}], 'year': 2013, 'venue': 'Chin Sci Bull', 'volume': '58', 'issn': '', 'pages': '1857--1863', 'other_ids': {}}, 'BIBREF65': {'ref_id': 'b65', 'title': 'Environmental and social influences on emerging infectious diseases: past, present and future', 'authors': [{'first': 'A', 'middle': [], 'last': 'Mcmichael', 'suffix': ''}], 'year': 2004, 'venue': 'Phil Trans R Soc Lond B', 'volume': '359', 'issn': '', 'pages': '1049--1058', 'other_ids': {}}, 'BIBREF66': {'ref_id': 'b66', 'title': 'Large-scale spatial-transmission models of infectious disease', 'authors': [{'first': 'S', 'middle': [], 'last': 'Riley', 'suffix': ''}], 'year': 2007, 'venue': 'Science', 'volume': '316', 'issn': '', 'pages': '1298--1301', 'other_ids': {}}, 'BIBREF67': {'ref_id': 'b67', 'title': 'Forecast and control of 
epidemics in a globalized world', 'authors': [{'first': 'L', 'middle': [], 'last': 'Hufnagel', 'suffix': ''}, {'first': 'D', 'middle': [], 'last': 'Brockmann', 'suffix': ''}, {'first': 'T', 'middle': [], 'last': 'Geisel', 'suffix': ''}], 'year': 2004, 'venue': 'Proc Natl Acad Sci', 'volume': '101', 'issn': '', 'pages': '15124--15129', 'other_ids': {}}, 'BIBREF68': {'ref_id': 'b68', 'title': 'The role of the airline transportation network in the prediction and predictability of global epidemic', 'authors': [{'first': 'V', 'middle': [], 'last': 'Colizza', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Barrat', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Barthélemy', 'suffix': ''}], 'year': 2006, 'venue': 'Proc Natl Acad Sci', 'volume': '103', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF69': {'ref_id': 'b69', 'title': 'Evolution of scaling emergence in large-scale spatial epidemic spreading', 'authors': [{'first': 'L', 'middle': [], 'last': 'Wang', 'suffix': ''}, {'first': 'X', 'middle': [], 'last': 'Li', 'suffix': ''}, {'first': 'Y Q', 'middle': [], 'last': 'Zhang', 'suffix': ''}], 'year': 2011, 'venue': 'PLoS One', 'volume': '6', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF70': {'ref_id': 'b70', 'title': 'Scaling, universality, and renormalization: Three pillars of modern critical phenomena', 'authors': [{'first': 'H', 'middle': ['E'], 'last': 'Stanley', 'suffix': ''}], 'year': 1999, 'venue': 'Rev Mod Phys', 'volume': '71', 'issn': '', 'pages': '358--366', 'other_ids': {}}, 'BIBREF71': {'ref_id': 'b71', 'title': \"Deviation of Zipf's and Heaps' laws in human languages with limited dictionary sizes\", 'authors': [{'first': 'L', 'middle': [], 'last': 'Lv', 'suffix': ''}, {'first': 'Z-K', 'middle': [], 'last': 'Zhang', 'suffix': ''}, {'first': 'T', 'middle': [], 'last': 'Zhou', 'suffix': ''}], 'year': 2013, 'venue': 'Sci Rep', 'volume': '', 'issn': '3', 'pages': '', 'other_ids': {}}, 'BIBREF72': {'ref_id': 'b72', 'title': 'Annual Estimates of 
the Population of Metropolitan and Micropolitan Statistical Areas', 'authors': [], 'year': 2000, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF73': {'ref_id': 'b73', 'title': 'Spatial networks', 'authors': [{'first': 'M', 'middle': [], 'last': 'Barthélemy', 'suffix': ''}], 'year': 2010, 'venue': 'Phys Rep', 'volume': '499', 'issn': '', 'pages': '1--101', 'other_ids': {}}, 'BIBREF74': {'ref_id': 'b74', 'title': 'Statistical laws in urban mobility from microscopic GPS data in the area of Florence', 'authors': [{'first': 'A', 'middle': [], 'last': 'Bazzani', 'suffix': ''}, {'first': 'B', 'middle': [], 'last': 'Giorgini', 'suffix': ''}, {'first': 'S', 'middle': [], 'last': 'Rambaldi', 'suffix': ''}], 'year': 2010, 'venue': 'J Stat Mech', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF75': {'ref_id': 'b75', 'title': 'Collective human mobility pattern from taxi trips in urban area', 'authors': [{'first': 'C B', 'middle': [], 'last': 'Peng', 'suffix': ''}, {'first': 'X G', 'middle': [], 'last': 'Jin', 'suffix': ''}, {'first': 'K C', 'middle': [], 'last': 'Wong', 'suffix': ''}], 'year': 2012, 'venue': 'PLoS One', 'volume': '7', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF76': {'ref_id': 'b76', 'title': 'Spatiotemporal patterns of urban human mobility', 'authors': [{'first': 'S', 'middle': [], 'last': 'Hasan', 'suffix': ''}, {'first': 'C M', 'middle': [], 'last': 'Schneider', 'suffix': ''}, {'first': 'S V', 'middle': [], 'last': 'Ukkusuria', 'suffix': ''}], 'year': 2012, 'venue': 'J Stat Phys', 'volume': '1', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF77': {'ref_id': 'b77', 'title': 'Unraveling the origin of exponential law in intra-urban human mobility', 'authors': [{'first': 'X', 'middle': [], 'last': 'Liang', 'suffix': ''}, {'first': 'J C', 'middle': [], 'last': 'Zhao', 'suffix': ''}, {'first': 'L', 'middle': [], 'last': 'Dong', 'suffix': ''}], 'year': 2013, 'venue': 'Sci Rep', 'volume': '', 'issn': '3', 
'pages': '', 'other_ids': {}}, 'BIBREF78': {'ref_id': 'b78', 'title': 'Diversity of individual mobility patterns and emergence of aggregated scaling laws', 'authors': [{'first': 'X Y', 'middle': [], 'last': 'Yan', 'suffix': ''}, {'first': 'X P', 'middle': [], 'last': 'Han', 'suffix': ''}, {'first': 'B H', 'middle': [], 'last': 'Wang', 'suffix': ''}], 'year': 2013, 'venue': 'Sci Rep', 'volume': '', 'issn': '3', 'pages': '', 'other_ids': {}}, 'BIBREF79': {'ref_id': 'b79', 'title': 'The scaling of contact rates with population density for the infectious disease models', 'authors': [{'first': 'H', 'middle': [], 'last': 'Hu', 'suffix': ''}, {'first': 'K', 'middle': [], 'last': 'Nigmatulina', 'suffix': ''}, {'first': 'P', 'middle': [], 'last': 'Eckhoff', 'suffix': ''}], 'year': 2013, 'venue': 'Math Biosci', 'volume': '244', 'issn': '', 'pages': '125--134', 'other_ids': {}}, 'BIBREF81': {'ref_id': 'b81', 'title': 'Globally networked risks and how to respond', 'authors': [{'first': 'D', 'middle': [], 'last': 'Helbing', 'suffix': ''}], 'year': 2013, 'venue': 'Nature', 'volume': '497', 'issn': '', 'pages': '51--59', 'other_ids': {}}, 'BIBREF82': {'ref_id': 'b82', 'title': 'Influenza A (H7N9) and the importance of digital epidemiology', 'authors': [{'first': 'M', 'middle': [], 'last': 'Salathé', 'suffix': ''}, {'first': 'C C', 'middle': [], 'last': 'Freifeld', 'suffix': ''}, {'first': 'S R', 'middle': [], 'last': 'Mekaru', 'suffix': ''}], 'year': 2013, 'venue': 'N Engl J Med', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {'DOI': ['10.1056/NEJMp1307752']}}, 'BIBREF83': {'ref_id': 'b83', 'title': 'A systematic review of studies on forecasting the dynamics of influenza outbreaks', 'authors': [{'first': 'E O', 'middle': [], 'last': 'Nsoesie', 'suffix': ''}, {'first': 'J S', 'middle': [], 'last': 'Brownstein', 'suffix': ''}, {'first': 'N', 'middle': [], 'last': 'Ramakrishnan', 'suffix': ''}], 'year': 2013, 'venue': 'Influenza Other Respi Viruses', 'volume': '', 'issn': '', 
'pages': '', 'other_ids': {'DOI': ['10.1111/irv.12226']}}, 'BIBREF84': {'ref_id': 'b84', 'title': 'A mathematical model for the global spread of influenza', 'authors': [{'first': 'L A', 'middle': [], 'last': 'Rvachev', 'suffix': ''}, {'first': 'Longini', 'middle': [], 'last': 'Jr I M', 'suffix': ''}], 'year': 1985, 'venue': 'Math Biosci', 'volume': '75', 'issn': '', 'pages': '3--22', 'other_ids': {}}, 'BIBREF85': {'ref_id': 'b85', 'title': 'Human mobility and spatial disease dynamics', 'authors': [{'first': 'D', 'middle': [], 'last': 'Brockmann', 'suffix': ''}], 'year': 2009, 'venue': 'Reviews of nonlinear dynamics and complexity', 'volume': '2', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF86': {'ref_id': 'b86', 'title': 'Reactiondiffusion processes and metapopulation models in heterogeneous networks', 'authors': [{'first': 'V', 'middle': [], 'last': 'Colizza', 'suffix': ''}, {'first': 'R', 'middle': [], 'last': 'Pastor-Satorras', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Vespignani', 'suffix': ''}], 'year': 2007, 'venue': 'Nat Phys', 'volume': '3', 'issn': '', 'pages': '276--282', 'other_ids': {}}, 'BIBREF87': {'ref_id': 'b87', 'title': 'No reuse allowed without permission. 
The copyright holder for this preprint (which was not peer-reviewed) is the', 'authors': [], 'year': None, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {'DOI': ['10.1101/003889']}}, 'BIBREF88': {'ref_id': 'b88', 'title': 'Epidemic modeling in metapopulation systems with heterogeneous coupling pattern: Theory and simulations', 'authors': [{'first': 'V', 'middle': [], 'last': 'Colizza', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Vespignani', 'suffix': ''}], 'year': 2008, 'venue': 'J Theor Biol', 'volume': '251', 'issn': '', 'pages': '450--467', 'other_ids': {}}, 'BIBREF89': {'ref_id': 'b89', 'title': 'Epidemic spreading by objective traveling', 'authors': [{'first': 'M', 'middle': [], 'last': 'Tang', 'suffix': ''}, {'first': 'Z H', 'middle': [], 'last': 'Liu', 'suffix': ''}, {'first': 'B', 'middle': ['W'], 'last': 'Li', 'suffix': ''}], 'year': 2009, 'venue': 'Europhys Lett', 'volume': '87', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF90': {'ref_id': 'b90', 'title': 'Phase transitions in contagion processes mediated by recurrent mobility patterns', 'authors': [{'first': 'D', 'middle': [], 'last': 'Balcan', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Vespignani', 'suffix': ''}], 'year': 2011, 'venue': 'Nat Phys', 'volume': '7', 'issn': '', 'pages': '581--586', 'other_ids': {}}, 'BIBREF91': {'ref_id': 'b91', 'title': 'Mobile phones as traffic probes: practices, prospects and issues', 'authors': [{'first': 'G', 'middle': [], 'last': 'Rose', 'suffix': ''}], 'year': 2006, 'venue': 'Transport Reviews', 'volume': '26', 'issn': '', 'pages': '275--291', 'other_ids': {}}, 'BIBREF92': {'ref_id': 'b92', 'title': 'Understanding individual human mobility patterns', 'authors': [{'first': 'M C', 'middle': [], 'last': 'González', 'suffix': ''}, {'first': 'C A', 'middle': [], 'last': 'Hidalgo', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Barabási', 'suffix': ''}], 'year': 2008, 'venue': 'Nature', 'volume': '453', 'issn': '', 
'pages': '779--782', 'other_ids': {}}, 'BIBREF93': {'ref_id': 'b93', 'title': 'Multiscale mobility networks and the spatial spreading of infectious diseases', 'authors': [{'first': 'D', 'middle': [], 'last': 'Balcan', 'suffix': ''}, {'first': 'V', 'middle': [], 'last': 'Colizza', 'suffix': ''}, {'first': 'B', 'middle': [], 'last': 'Gonçalves', 'suffix': ''}], 'year': 2009, 'venue': 'Proc Natl Acad Sci', 'volume': '106', 'issn': '', 'pages': '21484--21489', 'other_ids': {}}, 'BIBREF94': {'ref_id': 'b94', 'title': 'Limits of predictability in human mobility', 'authors': [{'first': 'C M', 'middle': [], 'last': 'Song', 'suffix': ''}, {'first': 'Z H', 'middle': [], 'last': 'Qu', 'suffix': ''}, {'first': 'N', 'middle': [], 'last': 'Blumm', 'suffix': ''}], 'year': 2010, 'venue': 'Science', 'volume': '327', 'issn': '', 'pages': '1018--1021', 'other_ids': {}}, 'BIBREF96': {'ref_id': 'b96', 'title': 'Invasion threshold in structured populations with recurrent mobility patterns', 'authors': [{'first': 'D', 'middle': [], 'last': 'Balcan', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Vespignani', 'suffix': ''}], 'year': 2012, 'venue': 'J Theor Biol', 'volume': '293', 'issn': '', 'pages': '87--100', 'other_ids': {}}, 'BIBREF97': {'ref_id': 'b97', 'title': 'Natural human mobility patterns and spatial spread of infectious diseases', 'authors': [{'first': 'V', 'middle': [], 'last': 'Belik', 'suffix': ''}, {'first': 'T', 'middle': [], 'last': 'Geisel', 'suffix': ''}, {'first': 'D', 'middle': [], 'last': 'Brockmann', 'suffix': ''}], 'year': 2011, 'venue': 'Phys Rev X', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF98': {'ref_id': 'b98', 'title': 'Capturing human behaviour', 'authors': [{'first': 'N', 'middle': [], 'last': 'Ferguson', 'suffix': ''}], 'year': 2007, 'venue': 'Nature', 'volume': '446', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF99': {'ref_id': 'b99', 'title': 'Adaptive networks: Theory, models and applications', 'authors': [], 'year': 
2009, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF100': {'ref_id': 'b100', 'title': 'Modeling the interplay between human behavior and the spread of infectious diseases', 'authors': [], 'year': 2013, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF101': {'ref_id': 'b101', 'title': 'Group-interest versus self-interest in smallpox vaccination policy', 'authors': [{'first': 'C T', 'middle': [], 'last': 'Bauch', 'suffix': ''}, {'first': 'A P', 'middle': [], 'last': 'Galvani', 'suffix': ''}, {'first': 'D J D', 'middle': [], 'last': 'Earn', 'suffix': ''}], 'year': 2003, 'venue': 'Proc Natl Acad Sci', 'volume': '100', 'issn': '', 'pages': '10564--10567', 'other_ids': {}}, 'BIBREF102': {'ref_id': 'b102', 'title': 'Coupled contagion dynamics of fear and disease: mathematical and computational explorations', 'authors': [{'first': 'J M', 'middle': [], 'last': 'Epstein', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Parker', 'suffix': ''}, {'first': 'D', 'middle': [], 'last': 'Cummings', 'suffix': ''}], 'year': 2008, 'venue': 'Plos One', 'volume': '3', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF103': {'ref_id': 'b103', 'title': 'The spread of awareness and its impact on epidemic outbreaks', 'authors': [{'first': 'S', 'middle': [], 'last': 'Funk', 'suffix': ''}, {'first': 'E', 'middle': [], 'last': 'Gilad', 'suffix': ''}, {'first': 'C', 'middle': [], 'last': 'Watkins', 'suffix': ''}], 'year': 2009, 'venue': 'Proc Natl Acad Sci', 'volume': '106', 'issn': '', 'pages': '6872--6877', 'other_ids': {}}, 'BIBREF104': {'ref_id': 'b104', 'title': 'Hub nodes inhibit the outbreak of epidemic under voluntary vaccination', 'authors': [{'first': 'H F', 'middle': [], 'last': 'Zhang', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Zhang', 'suffix': ''}, {'first': 'C S', 'middle': [], 'last': 'Zhou', 'suffix': ''}], 'year': 2010, 'venue': 'New J Phys', 'volume': '12', 'issn': '', 'pages': '', 'other_ids': {}}, 
'BIBREF105': {'ref_id': 'b105', 'title': 'Towards a characterization of behavior-disease models', 'authors': [{'first': 'N', 'middle': [], 'last': 'Perra', 'suffix': ''}, {'first': 'D', 'middle': [], 'last': 'Balcan', 'suffix': ''}, {'first': 'B', 'middle': [], 'last': 'Gonçalves', 'suffix': ''}], 'year': 2011, 'venue': 'Plos One', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF106': {'ref_id': 'b106', 'title': 'The impact of awareness on epidemic spreading in networks', 'authors': [{'first': 'Q', 'middle': [], 'last': 'Wu', 'suffix': ''}, {'first': 'X', 'middle': [], 'last': 'Fu', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Small', 'suffix': ''}], 'year': 2012, 'venue': 'Chaos', 'volume': '22', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF107': {'ref_id': 'b107', 'title': 'Impacts of subsidy policies on vaccination decisions in contact networks', 'authors': [{'first': 'H F', 'middle': [], 'last': 'Zhang', 'suffix': ''}, {'first': 'Z X', 'middle': [], 'last': 'Wu', 'suffix': ''}, {'first': 'X K', 'middle': [], 'last': 'Xu', 'suffix': ''}], 'year': 2013, 'venue': 'Phys Rev E', 'volume': '88', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF108': {'ref_id': 'b108', 'title': 'Epidemic spreading with information-driven vaccination', 'authors': [{'first': 'Z Y', 'middle': [], 'last': 'Ruan', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Tang', 'suffix': ''}, {'first': 'Z', 'middle': [], 'last': 'Liu', 'suffix': ''}], 'year': 2013, 'venue': 'Phy Rev E', 'volume': '86', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF109': {'ref_id': 'b109', 'title': 'Effects of delayed recovery and nonuniform transmission on the spreading of diseases in complex networks', 'authors': [{'first': 'C Y', 'middle': [], 'last': 'Xia', 'suffix': ''}, {'first': 'Z', 'middle': [], 'last': 'Wang', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Sanz', 'suffix': ''}], 'year': 2013, 'venue': 'Physica A', 'volume': '392', 'issn': '', 'pages': 
'1577--1585', 'other_ids': {}}, 'BIBREF110': {'ref_id': 'b110', 'title': 'Epidemic spreading on contact networks with adaptive weights', 'authors': [{'first': 'G H', 'middle': [], 'last': 'Zhu', 'suffix': ''}, {'first': 'G R', 'middle': [], 'last': 'Chen', 'suffix': ''}, {'first': 'X J', 'middle': [], 'last': 'Xu', 'suffix': ''}], 'year': 2013, 'venue': 'J Theor Biol', 'volume': '317', 'issn': '', 'pages': '133--139', 'other_ids': {}}, 'BIBREF111': {'ref_id': 'b111', 'title': 'Modeling human mobility responses to the large-scale spreading of infectious diseases', 'authors': [{'first': 'S', 'middle': [], 'last': 'Meloni', 'suffix': ''}, {'first': 'N', 'middle': [], 'last': 'Perra', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Arenas', 'suffix': ''}], 'year': 2011, 'venue': 'Sci Rep', 'volume': '1', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF112': {'ref_id': 'b112', 'title': 'Safety-information-driven human mobility patterns with metapopulation epidemic dynamics Sci Rep', 'authors': [{'first': 'B', 'middle': [], 'last': 'Wang', 'suffix': ''}, {'first': 'L', 'middle': [], 'last': 'Cao', 'suffix': ''}, {'first': 'H', 'middle': [], 'last': 'Suzuki', 'suffix': ''}], 'year': 2012, 'venue': '', 'volume': '2', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF113': {'ref_id': 'b113', 'title': 'Comparing large-scale computational approaches to epidemic modeling: Agent-based versus structured metapopulation models', 'authors': [{'first': 'M', 'middle': [], 'last': 'Ajelli', 'suffix': ''}, {'first': 'B', 'middle': [], 'last': 'Gonçalves', 'suffix': ''}, {'first': 'D', 'middle': [], 'last': 'Balcan', 'suffix': ''}], 'year': 2010, 'venue': 'BMC Infect Dis', 'volume': '10', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF114': {'ref_id': 'b114', 'title': 'A high-resolution human contact network for infectious disease transmission', 'authors': [{'first': 'M', 'middle': [], 'last': 'Salathé', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Kazandjieva', 
'suffix': ''}, {'first': 'J W', 'middle': [], 'last': 'Lee', 'suffix': ''}], 'year': 2010, 'venue': 'Proc Natl Acad Sci', 'volume': '107', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF115': {'ref_id': 'b115', 'title': \"What's in a crowd? Analysis of face-to-face behavioral networks\", 'authors': [{'first': 'L', 'middle': [], 'last': 'Isella', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Stehlé', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Barrat', 'suffix': ''}], 'year': 2011, 'venue': 'J Theor Biol', 'volume': '271', 'issn': '', 'pages': '166--180', 'other_ids': {}}, 'BIBREF116': {'ref_id': 'b116', 'title': 'Predictability of conversation partners', 'authors': [{'first': 'T', 'middle': [], 'last': 'Takaguchi', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Nakamura', 'suffix': ''}, {'first': 'N', 'middle': [], 'last': 'Sato', 'suffix': ''}], 'year': 2011, 'venue': 'Phys Rev X', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF117': {'ref_id': 'b117', 'title': 'Towards a temporal network analysis of interactive WiFi users', 'authors': [{'first': 'Y', 'middle': [], 'last': 'Zhang', 'suffix': ''}, {'first': 'L', 'middle': [], 'last': 'Wang', 'suffix': ''}, {'first': 'Y Q', 'middle': [], 'last': 'Zhang', 'suffix': ''}], 'year': 2012, 'venue': 'Europhys Lett', 'volume': '98', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF118': {'ref_id': 'b118', 'title': \"Characterizing large-scale population's indoor spatio-temporal interactive behaviors\", 'authors': [{'first': 'Y Q', 'middle': [], 'last': 'Zhang', 'suffix': ''}, {'first': 'X', 'middle': [], 'last': 'Li', 'suffix': ''}], 'year': 2012, 'venue': \"Proceeding of the ACM SIGKDD International Workshop on Urban Computing (UrbComp'12)\", 'volume': '', 'issn': '', 'pages': '25--32', 'other_ids': {}}, 'BIBREF119': {'ref_id': 'b119', 'title': 'Temporal dynamics and impact of event interactions in cyber-social populations', 'authors': [{'first': 'Y Q', 'middle': [], 'last': 
'Zhang', 'suffix': ''}, {'first': 'X', 'middle': [], 'last': 'Li', 'suffix': ''}], 'year': 2013, 'venue': 'Chaos', 'volume': '23', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF120': {'ref_id': 'b120', 'title': 'Temporal Networks', 'authors': [], 'year': 2013, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF121': {'ref_id': 'b121', 'title': 'Locationspecific patterns of exposure to recent pre-pandemic strains of influenza A in southern China', 'authors': [{'first': 'J', 'middle': [], 'last': 'Lessler', 'suffix': ''}, {'first': 'D A T', 'middle': [], 'last': 'Cummings', 'suffix': ''}, {'first': 'J M', 'middle': [], 'last': 'Read', 'suffix': ''}], 'year': 2011, 'venue': 'Nat Commun', 'volume': '2', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF122': {'ref_id': 'b122', 'title': 'An exploratory spatial analysis of pneumonia and influenza hospitalizations in Ontario by age and gender', 'authors': [{'first': 'E J', 'middle': [], 'last': 'Crighton', 'suffix': ''}, {'first': 'S J', 'middle': [], 'last': 'Elliott', 'suffix': ''}, {'first': 'R', 'middle': [], 'last': 'Moineddin', 'suffix': ''}], 'year': 2007, 'venue': 'Epidemiol Infect', 'volume': '135', 'issn': '', 'pages': '253--261', 'other_ids': {}}, 'BIBREF123': {'ref_id': 'b123', 'title': 'How human locationspecific contact patterns impact spatial transmission between populations? 
Sci Rep', 'authors': [{'first': 'L', 'middle': [], 'last': 'Wang', 'suffix': ''}, {'first': 'Z', 'middle': [], 'last': 'Wang', 'suffix': ''}, {'first': 'Y', 'middle': [], 'last': 'Zhang', 'suffix': ''}], 'year': 2013, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF124': {'ref_id': 'b124', 'title': 'The impact of locationspecific contact pattern on the sir epidemic transmission between populations', 'authors': [{'first': 'L', 'middle': [], 'last': 'Wang', 'suffix': ''}, {'first': 'Y', 'middle': [], 'last': 'Zhang', 'suffix': ''}, {'first': 'Z', 'middle': [], 'last': 'Wang', 'suffix': ''}], 'year': 2013, 'venue': 'Int J Bifurcat Chaos', 'volume': '23', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF125': {'ref_id': 'b125', 'title': 'Mathematical epidemiology of infectious diseases: Model building, analysis and interpretation', 'authors': [{'first': 'O', 'middle': [], 'last': 'Diekmann', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Heesterbeek', 'suffix': ''}], 'year': 2000, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF126': {'ref_id': 'b126', 'title': \"Heterogeneous length of stay of hosts' movements and spatial epidemic spread\", 'authors': [{'first': 'C', 'middle': [], 'last': 'Poletto', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Tizzoni', 'suffix': ''}, {'first': 'V', 'middle': [], 'last': 'Colizza', 'suffix': ''}], 'year': 2012, 'venue': 'Sci Rep', 'volume': '2', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF127': {'ref_id': 'b127', 'title': 'Strategy to suppress epidemic explosion in heterogeneous metapopulation networks', 'authors': [{'first': 'C S', 'middle': [], 'last': 'Shen', 'suffix': ''}, {'first': 'H S', 'middle': [], 'last': 'Chen', 'suffix': ''}, {'first': 'Z', 'middle': [], 'last': 'Hou', 'suffix': ''}], 'year': 2012, 'venue': 'Phys Rev E', 'volume': '86', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF128': {'ref_id': 'b128', 'title': 'Rendezvous effects in the 
diffusion process on bipartite metapopulation networks', 'authors': [{'first': 'L', 'middle': [], 'last': 'Cao', 'suffix': ''}, {'first': 'X', 'middle': [], 'last': 'Li', 'suffix': ''}, {'first': 'B', 'middle': [], 'last': 'Wang', 'suffix': ''}], 'year': 2011, 'venue': 'Phys Rev E', 'volume': '84', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF129': {'ref_id': 'b129', 'title': 'Will travel restrictions control the international spread of pandemic influenza', 'authors': [{'first': 'Déirdre', 'middle': [], 'last': 'Hollingsworth', 'suffix': ''}, {'first': 'T', 'middle': [], 'last': 'Ferguson', 'suffix': ''}, {'first': 'N M', 'middle': [], 'last': 'Anderson', 'suffix': ''}, {'first': 'R', 'middle': [], 'last': '', 'suffix': ''}], 'year': 2006, 'venue': 'Nat Med', 'volume': '12', 'issn': '', 'pages': '497--499', 'other_ids': {}}, 'BIBREF130': {'ref_id': 'b130', 'title': 'Delaying the international spread of pandemic influenza', 'authors': [{'first': 'B S', 'middle': [], 'last': 'Cooper', 'suffix': ''}, {'first': 'R J', 'middle': [], 'last': 'Pitman', 'suffix': ''}, {'first': 'W J', 'middle': [], 'last': 'Edmunds', 'suffix': ''}], 'year': 2006, 'venue': 'PLoS Med', 'volume': '3', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF131': {'ref_id': 'b131', 'title': 'A simple explanation for the low impact of border control as a countermeasure to the spread of an infectious disease', 'authors': [{'first': 'G S', 'middle': [], 'last': 'Tomba', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Wallinga', 'suffix': ''}], 'year': 2008, 'venue': 'Math Biosci', 'volume': '214', 'issn': '', 'pages': '70--72', 'other_ids': {}}, 'BIBREF132': {'ref_id': 'b132', 'title': 'Human mobility networks, travel restrictions, and the global spread of 2009 H1N1 pandemic', 'authors': [{'first': 'P', 'middle': [], 'last': 'Bajardi', 'suffix': ''}, {'first': 'C', 'middle': [], 'last': 'Poletto', 'suffix': ''}, {'first': 'J J', 'middle': [], 'last': 'Ramasco', 'suffix': ''}], 'year': 2011, 
'venue': 'PLoS One', 'volume': '6', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF133': {'ref_id': 'b133', 'title': 'Strategies for mitigating an influenza pandemic', 'authors': [{'first': 'N M', 'middle': [], 'last': 'Ferguson', 'suffix': ''}, {'first': 'D A T', 'middle': [], 'last': 'Cummings', 'suffix': ''}, {'first': 'C', 'middle': [], 'last': 'Fraser', 'suffix': ''}], 'year': 2006, 'venue': 'Nature', 'volume': '442', 'issn': '', 'pages': '448--450', 'other_ids': {}}, 'BIBREF134': {'ref_id': 'b134', 'title': 'Reducing the impact of the next influenza pandemic using household-based public health interventions', 'authors': [{'first': 'J T', 'middle': [], 'last': 'Wu', 'suffix': ''}, {'first': 'S', 'middle': [], 'last': 'Riley', 'suffix': ''}, {'first': 'C', 'middle': [], 'last': 'Fraser', 'suffix': ''}], 'year': 2006, 'venue': 'PLoS Med', 'volume': '3', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF135': {'ref_id': 'b135', 'title': 'No reuse allowed without permission. The copyright holder for this preprint (which was not peer-reviewed) is the', 'authors': [], 'year': None, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {'DOI': ['10.1101/003889']}}, 'BIBREF136': {'ref_id': 'b136', 'title': 'Modeling the worldwide spread of pandemic influenza: Baseline case and containment interventions', 'authors': [{'first': 'V', 'middle': [], 'last': 'Colizza', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Barrat', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Barthélemy', 'suffix': ''}], 'year': 2007, 'venue': 'PLoS Med', 'volume': '4', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF137': {'ref_id': 'b137', 'title': 'Optimizing the dose of prepandemic influenza vaccines to reduce the infection attack rate', 'authors': [{'first': 'S', 'middle': [], 'last': 'Riley', 'suffix': ''}, {'first': 'J T', 'middle': [], 'last': 'Wu', 'suffix': ''}, {'first': 'G', 'middle': [], 'last': 'Leung', 'suffix': ''}], 'year': 2007, 'venue': 'PLoS Med', 
'volume': '4', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF138': {'ref_id': 'b138', 'title': 'Modeling targeted layered containment of an influenza pandemic in the United States', 'authors': [{'first': 'Elizabeth', 'middle': [], 'last': 'Halloran', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Ferguson', 'suffix': ''}, {'first': 'N M', 'middle': [], 'last': 'Eubank', 'suffix': ''}, {'first': 'S', 'middle': [], 'last': '', 'suffix': ''}], 'year': 2008, 'venue': 'Proc Natl Acad Sci', 'volume': '105', 'issn': '', 'pages': '4639--4644', 'other_ids': {}}, 'BIBREF139': {'ref_id': 'b139', 'title': 'Logistical feasibility and potential benefits of a population-wide passiveimmunotherapy program during an influenza pandemic', 'authors': [{'first': 'J T', 'middle': [], 'last': 'Wu', 'suffix': ''}, {'first': 'C K', 'middle': [], 'last': 'Lee', 'suffix': ''}, {'first': 'B J', 'middle': [], 'last': 'Cowling', 'suffix': ''}], 'year': 2010, 'venue': 'Proc Natl Acad Sci', 'volume': '107', 'issn': '', 'pages': '3269--3274', 'other_ids': {}}, 'BIBREF140': {'ref_id': 'b140', 'title': 'Estimating the value of containment strategies in delaying the arrival time of an influenza pandemic: A case study of travel restriction and patient isolation', 'authors': [{'first': 'L', 'middle': [], 'last': 'Wang', 'suffix': ''}, {'first': 'Y', 'middle': [], 'last': 'Zhang', 'suffix': ''}, {'first': 'T Y', 'middle': [], 'last': 'Huang', 'suffix': ''}], 'year': 2012, 'venue': 'Phys Rev E', 'volume': '86', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF141': {'ref_id': 'b141', 'title': 'Will vaccines be available for the next influenza pandemic?', 'authors': [{'first': 'K', 'middle': [], 'last': 'Stohr', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Esveld', 'suffix': ''}], 'year': 2004, 'venue': 'Science', 'volume': '306', 'issn': '', 'pages': '2195--2196', 'other_ids': {}}, 'BIBREF142': {'ref_id': 'b142', 'title': 'Reflections on pandemic (H1N1) 2009 and the international 
response', 'authors': [{'first': 'G M', 'middle': [], 'last': 'Leung', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Nicoll', 'suffix': ''}], 'year': 2010, 'venue': 'PLoS Med', 'volume': '7', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF143': {'ref_id': 'b143', 'title': 'Vaccine production, distribution, access, and uptake', 'authors': [{'first': 'J', 'middle': [], 'last': 'Smith', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Lipsitch', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Almond', 'suffix': ''}], 'year': 2011, 'venue': 'Lancet', 'volume': '378', 'issn': '', 'pages': '428--438', 'other_ids': {}}, 'BIBREF144': {'ref_id': 'b144', 'title': 'Antiviral resistance in influenza viruses-Implications for management and pandemic response', 'authors': [{'first': 'F', 'middle': [], 'last': 'Hayden', 'suffix': ''}], 'year': 2006, 'venue': 'N Engl J Med', 'volume': '354', 'issn': '', 'pages': '785--788', 'other_ids': {}}, 'BIBREF145': {'ref_id': 'b145', 'title': 'Antiviral resistance and the control of pandemic influenza', 'authors': [{'first': 'M', 'middle': [], 'last': 'Lipsitch', 'suffix': ''}, {'first': 'T', 'middle': [], 'last': 'Cohen', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Murray', 'suffix': ''}], 'year': 2007, 'venue': 'PLoS Med', 'volume': '4', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF146': {'ref_id': 'b146', 'title': 'Hedging against antiviral resistance during the next influenza pandemic using small stockpiles of an alternative chemotherapy', 'authors': [{'first': 'J T', 'middle': [], 'last': 'Wu', 'suffix': ''}, {'first': 'G M', 'middle': [], 'last': 'Leung', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Lipsitch', 'suffix': ''}], 'year': 2009, 'venue': 'PLoS Med', 'volume': '6', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF147': {'ref_id': 'b147', 'title': 'The hidden geometry of complex, network-driven contagion phenomena', 'authors': [{'first': 'D', 'middle': [], 'last': 'Brockmann', 'suffix': 
''}, {'first': 'D', 'middle': [], 'last': 'Helbing', 'suffix': ''}], 'year': 2013, 'venue': 'Science', 'volume': '342', 'issn': '', 'pages': '1337--1342', 'other_ids': {}}, 'BIBREF148': {'ref_id': 'b148', 'title': \"Rumors in a network: Who's the culprit?\", 'authors': [{'first': 'D', 'middle': [], 'last': 'Shah', 'suffix': ''}, {'first': 'T', 'middle': [], 'last': 'Zaman', 'suffix': ''}], 'year': 2011, 'venue': 'IEEE T Inform Theory', 'volume': '57', 'issn': '', 'pages': '5163--5181', 'other_ids': {}}, 'BIBREF149': {'ref_id': 'b149', 'title': 'Identifying infection sources and regions in large networks', 'authors': [{'first': 'W Q', 'middle': [], 'last': 'Luo', 'suffix': ''}, {'first': 'W P', 'middle': [], 'last': 'Tay', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Leng', 'suffix': ''}], 'year': 2013, 'venue': 'IEEE T Signal Proces', 'volume': '61', 'issn': '', 'pages': '2850--2865', 'other_ids': {}}, 'BIBREF150': {'ref_id': 'b150', 'title': 'Predicting catastrophes in nonlinear dynamical systems by compressive sensing', 'authors': [{'first': 'W X', 'middle': [], 'last': 'Wang', 'suffix': ''}, {'first': 'Yang', 'middle': ['R'], 'last': 'Lai', 'suffix': ''}, {'first': 'Y C', 'middle': [], 'last': '', 'suffix': ''}], 'year': 2011, 'venue': 'Phys. Rev. Lett', 'volume': '106', 'issn': '', 'pages': '', 'other_ids': {}}}, 'ref_entries': {'FIGREF0': {'text': 'Color online) Schematic illustrations of the SIS (a) and the SIR (b) compartment models, where β, µ denote the transmission rate and the recovery rate, respectively.', 'latex': None, 'type': 'figure'}, 'FIGREF1': {'text': 'Color online) Illustration of the individual-network frame of the networked metapopulation model. a The model is composed of a network of subpopulations. The disease transmission among subpopulations stems from the mobility of infected individuals. 
b Each subpopulation refers to a location, in which a population of individuals interplays according to the compartment rule (e.g., SIR) that induces local disease outbreaks. Individuals are transferred among subpopulations via mobility networks.', 'latex': None, 'type': 'figure'}, 'FIGREF3': {'text': 'Color online) Air transportation network (a) vs. commuting network (b) of the US. Long-range airlines dominate the air transportation network, whereas the commuting routes are much geographically localized.', 'latex': None, 'type': 'figure'}}, 'back_matter': [{'text': 'Acknowledgements We appreciate the two anonymous referees for their valuable comments. We are grateful to the instructive discussions with Guanrong Chen, Joseph T. Wu, Shlomo Havlin, Ming Tang ', 'cite_spans': [], 'ref_spans': [{'start': 188, 'end': 192, 'text': 'Tang', 'ref_id': None}], 'section': 'acknowledgement'}]}\n", - "{'paper_id': 'f33c6d94b0efaa198f8f3f20e644625fa3fe10d2', 'metadata': {'title': 'Sequencing of the human IG light chain loci from a hydatidiform mole BAC library reveals locus-specific signatures of genetic diversity', 'authors': [{'first': 'Corey', 'middle': ['T'], 'last': 'Watson', 'suffix': '', 'affiliation': {'laboratory': '', 'institution': 'Simon Fraser University', 'location': {'postCode': 'V5A 1S6', 'settlement': 'Burnaby', 'region': 'British Columbia', 'country': 'Canada'}}, 'email': ''}, {'first': 'Karyn', 'middle': ['Meltz'], 'last': 'Steinberg', 'suffix': '', 'affiliation': {'laboratory': '', 'institution': 'University of Washington', 'location': {'postCode': '98195', 'settlement': 'Seattle', 'region': 'WA', 'country': 'USA'}}, 'email': ''}, {'first': 'Tina', 'middle': ['A'], 'last': 'Graves', 'suffix': '', 'affiliation': {'laboratory': '', 'institution': 'Washington University', 'location': {'postCode': '63108', 'settlement': 'St. 
Louis', 'region': 'MO', 'country': 'USA'}}, 'email': ''}, {'first': 'Rene', 'middle': ['L'], 'last': 'Warren', 'suffix': '', 'affiliation': {'laboratory': '', 'institution': 'BC Cancer Agency', 'location': {'postCode': 'V5Z 4S6', 'settlement': 'Vancouver', 'region': 'British Columbia', 'country': 'Canada'}}, 'email': ''}, {'first': 'Maika', 'middle': [], 'last': 'Malig', 'suffix': '', 'affiliation': {'laboratory': '', 'institution': 'University of Washington', 'location': {'postCode': '98195', 'settlement': 'Seattle', 'region': 'WA', 'country': 'USA'}}, 'email': ''}, {'first': 'Jacqueline', 'middle': [], 'last': 'Schein', 'suffix': '', 'affiliation': {'laboratory': '', 'institution': 'BC Cancer Agency', 'location': {'postCode': 'V5Z 4S6', 'settlement': 'Vancouver', 'region': 'British Columbia', 'country': 'Canada'}}, 'email': ''}, {'first': 'Richard', 'middle': ['K'], 'last': 'Wilson', 'suffix': '', 'affiliation': {'laboratory': '', 'institution': 'Washington University', 'location': {'postCode': '63108', 'settlement': 'St. 
Louis', 'region': 'MO', 'country': 'USA'}}, 'email': ''}, {'first': 'Robert', 'middle': ['A'], 'last': 'Holt', 'suffix': '', 'affiliation': {'laboratory': '', 'institution': 'BC Cancer Agency', 'location': {'postCode': 'V5Z 4S6', 'settlement': 'Vancouver', 'region': 'British Columbia', 'country': 'Canada'}}, 'email': ''}, {'first': 'Evan', 'middle': ['E'], 'last': 'Eichler', 'suffix': '', 'affiliation': {'laboratory': '', 'institution': 'University of Washington', 'location': {'postCode': '98195', 'settlement': 'Seattle', 'region': 'WA', 'country': 'USA'}}, 'email': ''}, {'first': 'Felix', 'middle': [], 'last': 'Breden', 'suffix': '', 'affiliation': {'laboratory': '', 'institution': 'Simon Fraser University', 'location': {'postCode': 'V5A 1S6', 'settlement': 'Burnaby', 'region': 'British Columbia', 'country': 'Canada'}}, 'email': ''}]}, 'abstract': [{'text': 'Germline variation at immunoglobulin gene (IG) loci is critical for pathogen-mediated immunity, but establishing complete reference sequences in these regions is problematic because of segmental duplications and somatically rearranged source DNA. We sequenced BAC clones from the essentially haploid hydatidiform mole, CHM1, across the light chain IG loci, kappa (IGK) and lambda (IGL), creating single haplotype representations of these regions. The IGL haplotype is 1.25Mb of contiguous sequence with four novel V gene and one novel C gene alleles and an 11.9kbp insertion. The IGK haplotype consists of two 644kbp proximal and 466kbp distal contigs separated by a gap also present in the reference genome sequence. Our effort added an additional 49kbp of unique sequence extending into this gap. The IGK haplotype contains six novel V gene and one novel J gene alleles and a 16.7kbp region with increased sequence identity between the two IGK contigs, exhibiting signatures of interlocus gene conversion. 
Our data facilitated the first comparison of nucleotide diversity between the light and IG heavy (IGH) chain haplotypes within a single genome, revealing a three to six fold enrichment in the IGH locus, supporting the theory that the heavy chain may be more important in determining antigenic specificity.', 'cite_spans': [], 'ref_spans': [], 'section': 'Abstract'}], 'body_text': [{'text': 'Antibodies are essential components of the immune system that play key roles in processes associated with innate and adaptive immunity 1 . They are expressed by B-cells as either cell surface receptors or secreted proteins, and are formed by two pairs of identical \"heavy\" and \"light\" immunoglobulin (IG) protein chains, encoded by genes located at three primary loci in the human genome: the IG heavy chain (IGH) at 14q32.33, and the two IG light chain regions, lambda (IGL) and kappa (IGK), located at 22q11.2 and 2p11.2 2 . Specifically, through a unique mechanism referred to as V-(D)-J recombination 3 , individual Variable (V), Diversity (D), and Joining (J) genes at the IGH locus, and V and J genes at either the IGK or IGL loci recombine somatically at the DNA level to generate templates for the subsequent transcription and translation of antibody heavy and light chains, respectively. V-(D)-J recombination is accompanied by the random addition and deletion of nucleotides at the junctions of the combined V, D, and J genes by terminal deoxynucleotide transferase (TdT). The extreme variability observed in expressed antigen-naïve B-cell antibody repertoires is due to this combinatorial and junctional diversity, and partly ensures that the immune system is able to recognize and mount effective immune responses against a diverse range of potential pathogens. 
At the population and species level, IG haplotype and allelic variation also make important contributions to the diversity of expressed antibody repertoires [4] [5] [6] [7] ; however, broader roles of IG genetic polymorphism in antibody function are generally less well understood 8 .', 'cite_spans': [{'start': 605, 'end': 606, 'text': '3', 'ref_id': 'BIBREF2'}, {'start': 1531, 'end': 1534, 'text': '[4]', 'ref_id': 'BIBREF3'}, {'start': 1535, 'end': 1538, 'text': '[5]', 'ref_id': 'BIBREF4'}, {'start': 1539, 'end': 1542, 'text': '[6]', 'ref_id': 'BIBREF5'}, {'start': 1543, 'end': 1546, 'text': '[7]', 'ref_id': 'BIBREF6'}, {'start': 1655, 'end': 1656, 'text': '8', 'ref_id': 'BIBREF7'}], 'ref_spans': [], 'section': 'Introduction'}, {'text': 'Over the past three decades, extensive catalogues of genetic polymorphisms have been established for human IGH, IGK, and IGL genes (IMGT; the international ImMunoGeneTics information system; www.imgt.org 2, 9 ) . Importantly, not only has IG genetic diversity recently been implicated in inter-individual variation in expressed antibody repertoires [4] [5] [6] , but genetic variants in both IG coding regions and well-characterized regulatory motifs are also known to influence antibody expression and function, and mediate risk of disease phenotypes 10-12 . However, our understanding of germline variability at the IGH, IGK, and IGL loci remains severely limited, especially in terms of haplotype structure (i.e., large segmental duplications and deletions) as well as coding and non-coding sequence polymorphisms. 
We have begun to uncover this variability in the IGH locus, leading to complete nucleotide resolution descriptions of large structural variants (insertions, deletions, duplications, and complex rearrangements), including novel functional IGHV genes and alleles 7 .', 'cite_spans': [{'start': 204, 'end': 206, 'text': '2,', 'ref_id': 'BIBREF1'}, {'start': 207, 'end': 210, 'text': '9 )', 'ref_id': 'BIBREF8'}, {'start': 349, 'end': 352, 'text': '[4]', 'ref_id': 'BIBREF3'}, {'start': 353, 'end': 356, 'text': '[5]', 'ref_id': 'BIBREF4'}, {'start': 357, 'end': 360, 'text': '[6]', 'ref_id': 'BIBREF5'}, {'start': 552, 'end': 559, 'text': '10-12 .', 'ref_id': None}, {'start': 1079, 'end': 1080, 'text': '7', 'ref_id': 'BIBREF6'}], 'ref_spans': [], 'section': 'Introduction'}, {'text': 'In the present study we use the same approach as our previous characterization of IGH to analyze complete haploid reconstructions of the IGK and IGL loci, which to date, compared to IGH, remain less well investigated. The IGL region has only been fully sequenced and analyzed once in its entirety 13 , using multiple cosmid and BAC library resources. The region spans approximately 0.9 MB, and includes 69 IGLV, seven IGLJ, and seven IGLC functional/open reading frame (ORF) genes and pseudogenes. Additional V, J, and C genes not present in this haplotype are known to occur as insertion variants in the human population 2, [14] [15] [16] . Similarly, the initial sequence of the IGK locus was generated from a composite assembly of cosmid, bacteriophage, and BAC clone libraries 17 . A unique feature of the IGK locus is that it includes two large inverted segmental duplications (SDs) that comprise distinct V-gene containing regions (termed proximal and distal); these regions remain separated by a large, currently unsequenced, assembly gap. 
The proximal region spans 0.54 MB and includes 69 IGKV functional/ORF genes and pseudogenes, whereas the distal region spans 0.43 MB and includes 62 V genes and pseudogenes (distal V genes are denoted by a \"D\"; e.g., IGKV1D-13), five functional IGKJ genes, and a single functional IGKC gene also reside downstream of the proximal V gene cluster.', 'cite_spans': [{'start': 625, 'end': 629, 'text': '[14]', 'ref_id': 'BIBREF12'}, {'start': 630, 'end': 634, 'text': '[15]', 'ref_id': 'BIBREF13'}, {'start': 635, 'end': 639, 'text': '[16]', 'ref_id': 'BIBREF14'}, {'start': 781, 'end': 783, 'text': '17', 'ref_id': 'BIBREF15'}], 'ref_spans': [], 'section': 'Introduction'}, {'text': 'Haplotypes spanning the IGK proximal region, and a portion of the distal region have also been sequenced using BAC clones from the RPCI-11 library 18 . Both IGL and IGK are known to exhibit V, J, and C gene allelic and structural variation 2,14-16,19-21 .', 'cite_spans': [{'start': 147, 'end': 149, 'text': '18', 'ref_id': 'BIBREF16'}], 'ref_spans': [], 'section': 'Introduction'}, {'text': 'As a means to improve existing genomic resources in the IG loci, we have sequenced the IGK and IGL gene clusters from the CHORI-17 BAC library (CH17, BACPAC resources), previously constructed from a haploid hydatidiform mole cell line (CHM1htert). Together with data produced previously for the IGH locus from the CH17 library 7 , these sequences represent the first complete human haplotypes of all three IG loci from a single individual. Using these data, we have conducted the first comparison of genetic diversity between the three IG regions in the same haploid genome. Our analyses have facilitated the identification of novel single nucleotide polymorphisms (SNPs) within IGL and IGK gene coding regions and regulatory elements, and also revealed a novel large sequence conversion event between the IGKV proximal and distal regions. 
In addition, our unbiased tri-locus comparison shows a striking enrichment of structural and nucleotide diversity in the IGH locus, confirming previous suggestions that germline variation within the light chain gene loci is lower than that observed in IGH 15, 22, 23 .', 'cite_spans': [{'start': 1096, 'end': 1099, 'text': '15,', 'ref_id': 'BIBREF13'}, {'start': 1100, 'end': 1103, 'text': '22,', 'ref_id': 'BIBREF20'}, {'start': 1104, 'end': 1106, 'text': '23', 'ref_id': 'BIBREF21'}], 'ref_spans': [], 'section': 'Introduction'}, {'text': 'We analyzed sequences from 17 CH17 BAC clones (IGL, 9; IGK, 8) comprising tiling paths across the two loci (Figures 1 and 2 ) --one of the clones in IGK, CH17-198L18, had been sequenced previously. Clones unique to either IGK or IGL were then used to construct locuswide contigs; clones in IGKV proximal and distal regions were aligned separately because the gap separating these two regions was not completely filled by the current sequencing effort. Figure 1) . In total, we identified 37 of the 38 known functional/ORF IGLV genes, seven functional J genes, and four functional C genes. The remaining IGLV gene, IGHV5-39 was not found in CH17, consistent with it being an insertion polymorphism 15, 21 . Sequence comparisons within IGLV, IGLJ, and IGLC genes revealed allelic differences between CH17 and the Kawasaki haplotype at six V genes, two J genes, and one C gene (Figure 1 ; Supplementary Table 1 ). Six of these alleles, five of which were novel (IGLV6-57, IGLV11-55, IGLV5-45, IGLV5-48, and IGLC7), included non-synonymous changes. Notably, the novel allele identified at IGHV5-48 included a nonsense mutation that introduced a premature stop codon in the framework 3 region of the protein; 15 additional SNPs were also characterized within the exons of this gene. 
Prior to this study, only a single allele of IGLV5-48 had been described, classified as an ORF due to an uncharacteristic single nucleotide difference in the heptamer portion of the recombination signal (RS: TACAGT instead of CACAGTG 24 ). SNPs characterized in the remaining four novel functional/ORF alleles were represented in the 1000 genomes project (1KG) dataset 25 . Previously described regulatory motifs 13 , including RS sequences, associated with each of the 37 identified functional/ORF IGLV genes were also inspected for previously uncharacterized variants in the CH17 haplotype, but no SNPs in these regions were identified.', 'cite_spans': [{'start': 697, 'end': 700, 'text': '15,', 'ref_id': 'BIBREF13'}, {'start': 701, 'end': 703, 'text': '21', 'ref_id': 'BIBREF19'}, {'start': 1647, 'end': 1649, 'text': '25', 'ref_id': 'BIBREF23'}], 'ref_spans': [{'start': 107, 'end': 123, 'text': '(Figures 1 and 2', 'ref_id': 'FIGREF9'}, {'start': 452, 'end': 461, 'text': 'Figure 1)', 'ref_id': 'FIGREF9'}, {'start': 874, 'end': 883, 'text': '(Figure 1', 'ref_id': 'FIGREF9'}, {'start': 900, 'end': 907, 'text': 'Table 1', 'ref_id': None}], 'section': 'IGL and IGK reference sequences from the CH17 BAC library'}, {'text': 'Eight BAC clones were analyzed in the IGK locus, forming two independent contigs, one in the proximal region totalling 644 Kbp, and a second in the distal region of the locus totalling Table 2 ). We observed ten allelic differences at IGKV gene loci, and one allelic variant at IGKJ2; ten of these allelic variants, including IGKJ2*04, involved non-synonymous changes. We characterized nine novel alleles that were not represented in IMGT, including three that were observed in the Kawasaki haplotype. In two instances, we observed the presence of alleles that had been previously classified as either \"distal\" or \"proximal\" alleles residing at loci in the alternate location. 
For example, we found an allele that matched with 100% sequence identity to IGKV1-13*02 at the IGKV1D-13 locus in the CH17 haplotype, for which there had previously been only one allele described, IGKV1D-13*01. IGKV1D-13*01, which differs from IGKV1-13*02 by only a single nucleotide, was initially classified as an ORF due to an abnormal V-heptamer sequence, but was later identified in a productive rearrangement, indicating that it is likely functional 27 .', 'cite_spans': [], 'ref_spans': [{'start': 185, 'end': 192, 'text': 'Table 2', 'ref_id': 'TABREF2'}], 'section': 'IGL and IGK reference sequences from the CH17 BAC library'}, {'text': 'Interestingly, from our analysis of regulatory elements, we found that the allele in the CH17 haplotype at this locus (IGKV1-13*02) was associated with a typical, non-mutated V-heptamer sequence. Thus, this functional V-heptamer variant may also be associated with IGKV1D-13 alleles (e.g., IGKV1D-13*01 noted above), and facilitate their expression.', 'cite_spans': [], 'ref_spans': [], 'section': 'IGL and IGK reference sequences from the CH17 BAC library'}, {'text': 'Discrepancies have also been noted regarding the functionality of IGKV1-8, for which only one coding allele is known. Genomic descriptions of this gene have revealed a 21 bp deletion in an upstream regulatory element, which had been predicted to disruption of promoter function and inhibit expression (mutation observed in the Kawasaki haplotype 28 ); however, IGKV1-8*01 has been shown to be expressed in some cases 27 . 
Potentially explaining this discrepancy, we found that the previously described IGKV1-8*01 21 bp promoter deletion was not present in the CH17 haplotype, suggesting that this germline indel variant could contribute to variation in the expression of alleles at this locus.', 'cite_spans': [{'start': 417, 'end': 419, 'text': '27', 'ref_id': 'BIBREF25'}], 'ref_spans': [], 'section': 'IGL and IGK reference sequences from the CH17 BAC library'}, {'text': 'A direct comparison of the IGL CH17 and Kawasaki haplotypes revealed the presence of only a single structural variant. This 11.9 Kbp insertion was located in the region between the psuedogenes IGLV7-35 and IGLV2-34 within the BAC, CH17-242N13; the region between these pseudogenes spans ~120 Kbp and is devoid of IG genes. Gene prediction analysis did not identify any genes within the insertion, nor did the insertion disrupt the non-IG related genes, ZNF280B, ZNF280A, and PRAME, located in this region; the breakpoints of the event occurred between ZNF280A and PRAME. No structural variants were observed in the IGK CH17 haplotype.', 'cite_spans': [], 'ref_spans': [], 'section': 'Characterization of structural variants in IGL and IGK CH17 haplotypes'}, {'text': 'We also searched the CH17 haplotypes for all IGL and IGK gene/allele sequences classified as \"not located\" in the IMGT database, meaning that these genes have not been located within IGL or IGK loci. Using this approach we mapped the pseudogene IGLV2-NL1 to the locus in CH17 corresponding to the position of the pseudogene IGLV2-34 in the Kawasaki haplotype;', 'cite_spans': [], 'ref_spans': [], 'section': 'Characterization of structural variants in IGL and IGK CH17 haplotypes'}, {'text': 'IGLV2-NL1 matched CH17 at this locus with 100% sequence identity, suggesting that IGLV2-NL1 and IGLV2-34 genes are allelic rather than distinct loci. 
No additional exact matches between the CH17 haplotype and other \"not-located\" V genes were observed.', 'cite_spans': [], 'ref_spans': [], 'section': 'Characterization of structural variants in IGL and IGK CH17 haplotypes'}, {'text': 'Previous analysis of sequence similarity between shared homology blocks of proximal and distal segmental duplication units, which comprise the majority of the IGK locus, revealed that these two regions are >98% similar over most of the region 17 . Segmental duplications are known to facilitate sequence exchange via non-allelic homologous recombination and interlocus gene conversion 29, 30 ; however, given the lack of reference sequence data this has not been investigated in the IGK locus. To address this, we conducted pair-wise comparisons of distal and proximal regions from CH17 and Kawasaki haplotypes to search for large tracts of shared sequence ( Figure 3A ). The expectation is that sequence should be most similar between homologous regions in the alternate haplotype, whereas higher similarity between proximal and distal regions within a haplotype would suggest the occurrence of sequence exchange. Using this approach, we identified a large ~16.7 Kbp region that showed higher identity between the proximal and distal units of the Kawasaki haplotype than between the CH17 distal and Kawasaki proximal units ( Figure 3A ). This region included two IGKV genes for which we observed allelic variants between the Kawasaki and CH17 haplotypes. Four-way sequence alignments of this region show that the CH17 distal unit was most unique compared to the other three sequences The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/006866 doi: bioRxiv preprint haplotype. 
It is important to note, however, that the Kawasaki distal fragment harbors many unique bp differences compared to the other three sequences (blue tick marks, Figure 3B ), which could be suggestive of the occurrence of mutation following the predicted sequence exchange event. Further analysis of this multi-sequence alignment using the DSS method for recombination detection also predicted two potential flanking recombination breakpoints within the expected regions based on visual inspection of the sequence alignment and comparison of sequence similarities. We also analyzed sequence from two BAC clones in the proximal and distal clusters from the RPCI-11 BAC library 18 ; this analysis revealed that these carried the same variants observed in the Kawasaki haplotype.', 'cite_spans': [{'start': 243, 'end': 245, 'text': '17', 'ref_id': 'BIBREF15'}, {'start': 385, 'end': 388, 'text': '29,', 'ref_id': 'BIBREF27'}, {'start': 389, 'end': 391, 'text': '30', 'ref_id': 'BIBREF28'}], 'ref_spans': [{'start': 659, 'end': 668, 'text': 'Figure 3A', 'ref_id': 'FIGREF10'}, {'start': 1126, 'end': 1135, 'text': 'Figure 3A', 'ref_id': 'FIGREF10'}, {'start': 1704, 'end': 1713, 'text': 'Figure 3B', 'ref_id': 'FIGREF10'}], 'section': 'Analysis of proximal and distal regions in CH17 and Kawasaki haplotypes reveals evidence for sequence exchange'}, {'text': 'Compared to the number of allelic variants observed between the IGHV CH17 haplotype and the reference genome (19 allelic variants/40 V gene loci 31 ), V gene allelic variation described in this study for IGL (6 allelic variants/37 V gene loci) and IGK (10 allelic variants/44 V gene loci) was noticeably lower. This prompted us to also compare other genomic characteristics between the three loci. 
Excluding regions of structural variation between haplotypes, we first generated SNP calls (not including gaps and single bp indels) between the CH17 and reference haplotypes for all three loci; 491, 1046, and 2897 SNPs were identified for IGKV, IGLV, and IGHV, respectively (Table 2; Figure 4 , left panel). After cross referencing these SNPs with dbSNP135 and 1KG datasets, 74, 110, and 407 SNPs in the IGKV, IGLV, and IGHV gene regions were determined to be novel variants, not represented in either dataset. Not surprisingly, given the number of SNPs in the 1KG datasets, fewer SNPs at each locus were represented in dbSNP ( Figure 4 ). We examined these sites in publicly available Illumina data generated from . CC-BY-NC 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/006866 doi: bioRxiv preprint the CHM1 genomic DNA to determine if the novel SNPs were supported by an orthogonal platform. We identified 73/74, 103/110, and 406/407 sites in IGKV, IGLV, and IGHV, respectively, that are supported by the Illumina data. The discrepancies may represent sequencing errors. The novel SNPs for each region reported in GRCh37 coordinates are in Supplementary Table 3 .', 'cite_spans': [], 'ref_spans': [{'start': 683, 'end': 691, 'text': 'Figure 4', 'ref_id': 'FIGREF11'}, {'start': 1027, 'end': 1035, 'text': 'Figure 4', 'ref_id': 'FIGREF11'}, {'start': 1668, 'end': 1691, 'text': 'Supplementary Table 3', 'ref_id': None}], 'section': 'SNP diversity and genomic features in IGHV, IGLV, and IGKV gene regions'}, {'text': 'Consistent with observations based on V gene allelic variation, SNP density in IGHV (0.0035) was approximately 3-fold higher than in IGLV (0.0012) and 6-fold higher than in IGKV (0.0006); SNP densities were slightly elevated within functional/ORF V genes in each region compared to the calculated locus-wide values ( Table 2 ). 
An analysis of genomic features in the three loci also showed that the fraction of each locus covered by repeat content was highest in IGHV, whereas that covered by SDs, not surprisingly was highest in IGKV (Table 2 ). However, when SDs associated with the IGKV proximal/distal duplication event were excluded, SD coverage in IGHV (37.7%) was found to be much higher than both of the light chain loci (IGLV, 24.6%; IGKV, 28.2%; Table 2 ). When only SDs exhibiting > 95% sequence identity were considered, this difference was even more striking (IGHV, 28.8%; IGLV, 12.9%; IGKV, 1.0%).', 'cite_spans': [], 'ref_spans': [{'start': 317, 'end': 324, 'text': 'Table 2', 'ref_id': 'TABREF2'}, {'start': 535, 'end': 543, 'text': '(Table 2', 'ref_id': 'TABREF2'}, {'start': 756, 'end': 763, 'text': 'Table 2', 'ref_id': 'TABREF2'}], 'section': 'SNP diversity and genomic features in IGHV, IGLV, and IGKV gene regions'}, {'text': 'Interestingly, in IGHV, SNP density was higher in regions of segmental duplication compared to regions not covered by SDs, especially true when considering only those regions covered by SDs with >95% sequence identity (Table 2) ; however, this difference was not found to be significant after permuting SD vs. non-SD region assignments and re-assessing SNP densities of 10,000 MCFDR samplings (P > 0.05). SNP density was also increased in segmentally duplicated regions of IGKV with >95% sequence identity compared to regions not covered by SDs, but again, this difference was not significant (P > 0.05).', 'cite_spans': [], 'ref_spans': [{'start': 218, 'end': 227, 'text': '(Table 2)', 'ref_id': 'TABREF2'}], 'section': 'SNP diversity and genomic features in IGHV, IGLV, and IGKV gene regions'}, {'text': '. CC-BY-NC 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . 
https://doi.org/10.1101/006866 doi: bioRxiv preprint Given the telomeric location of IGHV and the known differences in nucleotide substitution patterns within telomeres compared to the rest of the genome 32 , we next assessed CHM1 SNP density within and around autosomal telomeres and centromeres. We found that mean SNP densities were ~2-fold higher in telomeric regions (within 3 Mbp = 0.001; 1 Mbp = 0.0009) when compared to centromeric regions (within 3 Mbp = 0.0005; 1 Mbp = 0.0004;', 'cite_spans': [], 'ref_spans': [], 'section': 'SNP diversity and genomic features in IGHV, IGLV, and IGKV gene regions'}, {'text': 'Supplementary Figure 2A ). Interestingly, SNP density within 1 Mbp of the q arm telomere of chr14 harbouring IGHV (0.002) was higher than all other telomeric regions, >2-fold higher than the telomeric average (Supplementary Figure 2B) . In contrast, when CHM1 SNP densities within 3 Mbp of all telomeres were compared, the region on the q arm of ch14 (including both IGHV and non-IGHV sequence) no longer stands out, suggesting that IGHV may have some unique properties contributing to higher than average genetic diversity. To place this in the context of the analysis conducted within the IG regions above, we also found that the q arm of ch14 has the second highest overlap with SDs (34%; Supplementary Figure 3 ).', 'cite_spans': [], 'ref_spans': [{'start': 14, 'end': 23, 'text': 'Figure 2A', 'ref_id': None}, {'start': 224, 'end': 234, 'text': 'Figure 2B)', 'ref_id': None}, {'start': 692, 'end': 714, 'text': 'Supplementary Figure 3', 'ref_id': 'FIGREF10'}], 'section': 'SNP diversity and genomic features in IGHV, IGLV, and IGKV gene regions'}, {'text': 'We present data for the first haplotypes of the human IGL and IGK loci from the same haploid genome, representing only the second full-length references constructed for these regions to date. 
From the CH17 clones, 12 novel alleles were identified in the two loci, including four IGLV alleles, seven IGKV alleles, and one IGLC allele. Two recent assessments of IGK allelic variation --one of a public dataset of 435 expressed sequences 33 , and a second of deepsequenced antibody repertoires from four individuals 34 --concluded that, unlike IGH, IGK allelic datasets are likely to be mostly complete, as only two putative novel alleles were identified from these analyses 33 . IGHV and IGKV repertoire sequencing in a single individual also supports . CC-BY-NC 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/006866 doi: bioRxiv preprint these observations, finding that nearly 25% of characterized alleles in IGHV were novel, compared to 0% in IGKV 35 . However, the fact that we identified 11 novel light chain V gene alleles from a single haploid genome implies that additional efforts to identify unreported alleles in IGL and IGK are warranted. Importantly, as noted previously in IGHV 7 , SNPs associated with novel alleles identified in IGL and IGK were present in the 1KG dataset, further supporting the notion that the 1KG dataset could serve as a useful resource for future investigations of IG gene diversity and the identification of novel polymorphisms. 
However, given the prevalence of segmental duplication in the IG loci, it will undoubtedly be essential to consider the impact of paralogous sequence variants (PSVs) when assessing 1KG SNP data in these regions, as complex and duplicated sequence structure are known to confound SNP characterization 36, 37 .', 'cite_spans': [{'start': 435, 'end': 437, 'text': '33', 'ref_id': 'BIBREF31'}, {'start': 672, 'end': 674, 'text': '33', 'ref_id': 'BIBREF31'}, {'start': 1074, 'end': 1076, 'text': '35', 'ref_id': 'BIBREF33'}, {'start': 1891, 'end': 1894, 'text': '36,', 'ref_id': 'BIBREF34'}, {'start': 1895, 'end': 1897, 'text': '37', 'ref_id': 'BIBREF35'}], 'ref_spans': [], 'section': 'Discussion'}, {'text': 'In addition to novel allelic variants within IGL and IGK coding regions, variants involving regulatory elements of two IGKV genes (IGKV1-8 and IGKV1D-13) were also identified. In both cases, regulatory elements previously associated with alleles at these loci were predicted to inhibit their expression; however, the alleles described for these genes in the CH17 haplotype were associated with regulatory region variants that would be expected to exhibit normal gene expression. The importance of such polymorphisms is that they can result in variable levels of gene expression, including the loss of genes/alleles from expressed repertoires.', 'cite_spans': [], 'ref_spans': [], 'section': 'Discussion'}, {'text': 'In addition to those noted above, several other IGKV genes are also classified as ORF genes based on irregular regulatory sequence motifs 27 . 
Notably, IGKV2D-29*02, previously referred to as \"VA2c\", has a non-canonical V-heptamer, but has also been shown to occur in productive rearrangements; thus, like IGKV1-8 and IGKV1D-13 alleles described in CH17, the expression of IGKV2D-29*02 could be explained by a previously undescribed regulatory sequence variant.', 'cite_spans': [{'start': 138, 'end': 140, 'text': '27', 'ref_id': 'BIBREF25'}], 'ref_spans': [], 'section': 'Discussion'}, {'text': 'Interestingly, a third allele at the IGKV2D-29 locus, \"V2b\", has also been shown to have a . CC-BY-NC 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/006866 doi: bioRxiv preprint defective RS V-heptamer, which results in decreased expression and has been implicated in susceptibility to Haemophilus influenzae type b infection 10, 38 .', 'cite_spans': [{'start': 451, 'end': 454, 'text': '10,', 'ref_id': 'BIBREF9'}, {'start': 455, 'end': 457, 'text': '38', 'ref_id': 'BIBREF36'}], 'ref_spans': [], 'section': 'Discussion'}, {'text': 'Perhaps the most important contribution of the full IGK and IGL sequences of the CH17 haplotype data presented here is that, for the first time, locus-wide genetic diversity between IGH, IGL, and IGK could be compared in the same haploid genome in relation to corresponding reference haplotypes. Our first observation from this comparison was that the number of V gene allelic differences among the CH17 haplotypes was highest in IGHV. This finding is perhaps not surprising given that allelic richness is also known to be highest in IGHV based on available genetic data in the IMGT database. In addition, studies of substitution patterns in heavy and light chain V genes have also revealed evidence for increased diversity in IGHV compared to IGLV and IGKV 23, 39 . 
Strikingly, however, when we extended our comparison to include all SNPs within the three loci, we found that locus-wide genetic diversity was also much higher in IGHV, indicating that increased diversity in this locus is not limited to within V gene coding regions. IGHV SNP density is also higher than that observed for killer cell immunoglobulin-like The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/006866 doi: bioRxiv preprint IGKV gene duplications/deletions (e.g., IGKV1-5 and IGKV3-20) has also more recently been noted from expressed antibody repertoire data, although these await confirmation as germline polymorphisms 34 . Thus, in total, liberal estimates from the literature indicate that only 4-6 light chain functional/ORF V genes are known to occur in SVs, in stark contrast to ~29 in IGHV 7, 8 . In line with this difference, four V gene-containing SVs were identified in the CH17 IGHV haplotype compared to zero defined in IGLV and IGKV.', 'cite_spans': [{'start': 758, 'end': 761, 'text': '23,', 'ref_id': 'BIBREF21'}, {'start': 762, 'end': 764, 'text': '39', 'ref_id': 'BIBREF37'}, {'start': 1467, 'end': 1469, 'text': '34', 'ref_id': 'BIBREF32'}, {'start': 1644, 'end': 1646, 'text': '7,', 'ref_id': 'BIBREF6'}, {'start': 1647, 'end': 1648, 'text': '8', 'ref_id': 'BIBREF7'}], 'ref_spans': [], 'section': 'Discussion'}, {'text': 'Two factors suggest that the higher rate of SVs in IGH may be attributable to the increased fraction of the locus covered by segmentally duplicated sequences. First, segmental duplications are known to be associated with SVs genome-wide 42 , and second, duplications have been shown specifically to facilitate structural variation in IGH 7 . 
Segmental duplications and tandem repeats also mediate sequence exchange either through gene conversion or recombination, that can either result in the homogenization of paralogous sequences 29, 30, 43 , or in an increase in genetic diversity 44, 45 . Illustrating the latter of these two scenarios, we found that for IGHV, SNP density was highest in regions including segmental duplications, particularly those with >95% sequence identity with their paralogs; similar trends were not noted for SNP density estimates calculated within non-SD repeat regions (Table 2) . Importantly, SDs with >95% identity comprised nearly twice the fraction of IGHV sequence than IGLV sequence, which may in part explain the differences observed in SNP density between IGH and IGL. It is also worth noting that the difference in the fraction of SDs between IGH and the light chain regions would be greater if the 222 Kbp of novel sequence (comprised primarily of SDs) identified in IGH by our previous study 7 were also included. In addition to this, assessments of CHM1 SNP density across autosomal telomeres and centromeres revealed that the telomeric region on chr14 containing IGHV had both elevated levels of SNP diversity, and increased SD . CC-BY-NC 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/006866 doi: bioRxiv preprint overlap, compared to analogous regions on other autosomes. 
This suggests that the genomic location of IGHV has also likely contributed to the increased genetic variation we observe in this locus compared to IGKV and IGLV.', 'cite_spans': [{'start': 237, 'end': 239, 'text': '42', 'ref_id': 'BIBREF40'}, {'start': 533, 'end': 536, 'text': '29,', 'ref_id': 'BIBREF27'}, {'start': 537, 'end': 540, 'text': '30,', 'ref_id': 'BIBREF28'}, {'start': 541, 'end': 543, 'text': '43', 'ref_id': 'BIBREF41'}, {'start': 585, 'end': 588, 'text': '44,', 'ref_id': 'BIBREF42'}, {'start': 589, 'end': 591, 'text': '45', 'ref_id': 'BIBREF44'}], 'ref_spans': [{'start': 899, 'end': 908, 'text': '(Table 2)', 'ref_id': 'TABREF2'}], 'section': 'Discussion'}, {'text': 'The distinct clustering and genomic partitioning of V gene families within the IGL locus, and the observation that, compared to IGH and IGK, there are fewer IGL orphans present in other regions of the genome, has prompted the suggestion that IGL genes have undergone less \"evolutionary shuffling\" 22 , which may be linked to lower levels of diversity in the locus 15 and would be consistent with the results presented here. In comparison to the other two loci, we found IGKV to have the lowest locus-wide SNP density, nearly 6-fold lower than that observed in IGHV. Due to the large inverted duplication of the proximal and distal regions, over 80% of the IGKV locus consists of SDs with >95% sequence identity. This suggests that, unlike in IGHV, SDs may be responsible for sequence homogenization rather than an increase in SNP diversity.', 'cite_spans': [{'start': 364, 'end': 366, 'text': '15', 'ref_id': 'BIBREF13'}], 'ref_spans': [], 'section': 'Discussion'}, {'text': 'The fact that we found evidence of a large tract of sequence exchange between the proximal and distal IGKV units lends support to this notion. 
However, fully understanding the relationship between segmental duplication and SNP density in the human IG regions will undoubtedly require further sequencing and comparisons of additional haplotypes. We must also acknowledge the potential for confounding effects related to the use of mosaic reference sequences for this comparison 13,17,31 , which were generated from multiple large insert libraries constructed from diploid tissues, in some cases of unknown ethnicity. Considering this, it is possible that a comparison of CH17 IG regions to references generated from individuals with different ethnic backgrounds could result in artifactual differences between loci. However, because our findings are supported by existing V gene allelic variation data at the population level, it seems unlikely that the difference in variability between loci is due to ethnic origin of the tissue. The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/006866 doi: bioRxiv preprint If the difference in SNP density observed here between the IG V gene clusters is in fact genuine, it raises the question of whether increased genetic diversity in IGH has any functional consequences. Given that SNP density within V gene coding regions in the CH17 haplotype was also higher in IGH compared to IGL and IGK, it could be speculated that mechanisms associated with an increased number of polymorphisms locus-wide in IGHV, could by default, result in greater IGHV gene diversity and a more variable expressed antibody repertoire. 
Intriguingly, in natural antibodies, the IG heavy chains are considered to play a more prominent role in epitope binding than IG light chains, although this is primarily attributed to residues of the third complementary determining region (CDRH3) not encoded by IGHV gene segments 46 ', 'cite_spans': [{'start': 2002, 'end': 2004, 'text': '46', 'ref_id': 'BIBREF45'}], 'ref_spans': [], 'section': 'Discussion'}, {'text': '. CC-BY-NC 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/006866 doi: bioRxiv preprint', 'cite_spans': [], 'ref_spans': [], 'section': 'Materials and methods'}, {'text': 'BAC-end reads from the CHORI-17 hydatidiform mole BAC library mapping to the GRCh37 reference genome were used to identify and select clones within the two light chain regions. The IGL and IGK genes are located at distinct loci in the genome, 22q11.2 and 2p11.2, respectively 2,13,17,49-51 . Nine clones mapping to the IGL region, and eight clones mapping to IGK were picked for complete sequencing. As described in Watson et al 7 clones were shotgun sequenced using high quality capillary-based Sanger sequencing and assemblies were constructed and finished on a per clone basis. Fully assembled overlapping BAC clones were then used to create contiguous assemblies spanning the two IG light chain regions using SeqMan Pro (DNA Star, Lasergene, Wisconsin, USA).', 'cite_spans': [], 'ref_spans': [], 'section': 'Sequencing of IGL and IGK regions from the CHORI-17 BAC library'}, {'text': 'Sequences of all functional and open reading frame (ORF) C, J, and V genes (based on IMGT classification) were downloaded from IMGT and Vega databases (www.imgt.org, vega.sanger.ac.uk). All sequences were aligned to the completed contigs of each locus using SeqMan Pro, the positions of which were confirmed using BLAST 52 . 
Sequences corresponding to each of the mapped C, J, and V genes were extracted from the CH17 contigs, and alleles at each locus were assigned using IMGT V-QUEST 53, 54 . \"Novel\" alleles were defined as those not found in the IMGT database. To search for potential variants in previously characterized regulatory sequences, SNPs determined from alignments of the CH17 haplotype and references The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/006866 doi: bioRxiv preprint from UCSC (www.genome.ucsc.edu), and SNP/gene region overlap was assessed using BEDTools version 2.1 55 . For those genes in which a SNP was found to occur within the defined regions, sequences in question from the CH17 and Kawasaki haplotypes were aligned, visually inspected, and compared to previously identified motifs.', 'cite_spans': [{'start': 320, 'end': 322, 'text': '52', 'ref_id': 'BIBREF51'}, {'start': 486, 'end': 489, 'text': '53,', 'ref_id': 'BIBREF52'}, {'start': 490, 'end': 492, 'text': '54', 'ref_id': 'BIBREF53'}, {'start': 967, 'end': 969, 'text': '55', 'ref_id': 'BIBREF54'}], 'ref_spans': [], 'section': 'Annotation of V, J, and C genes and regulatory regions from BAC clones'}, {'text': 'Using the program miropeats 56 CH17 contigs for IGK and IGL loci were compared individually to the sequences from the Kawasaki haplotypes (IGL, accession NG_000002.1;', 'cite_spans': [], 'ref_spans': [], 'section': 'Analyses of structural variants identified in CH17 IGL and IGK BAC clones'}, {'text': 'IGK-proximal/distal, accession NG_000834.1/NG_000833.1). The outputs for each comparison were visually inspected for potential regions of structural variation. Putative breakpoints for the single variant identified were determined by creating a multi-sequence alignment using sequences from the IGL Kawasaki haplotype and novel BAC clone that spanned the regions of the predicted variant-associated breakpoints. 
Multi-sequence alignments were generated and visualized in SeqMan Pro. Gene prediction was carried out using Genscan (genes.mit.edu/GENSCAN.html; 57 ), following by BLAST using the non-redundant gene database.', 'cite_spans': [], 'ref_spans': [], 'section': 'Analyses of structural variants identified in CH17 IGL and IGK BAC clones'}, {'text': 'For comparisons of proximal and distal V regions of the IGK locus from both the CH17 and Kawasaki haplotypes, only paralogous sequence shared between the proximal and distal regions were considered (i.e., sequence spanning the genes IGKV1-6, IGKV1-5, IGKV5-2, and IGKV4-1 was excluded, as these genes do have paralogous duplicates in the distal IGKV region).', 'cite_spans': [], 'ref_spans': [], 'section': 'Comparisons of IGK proximal and distal sequence similarity and recombination analysis'}, {'text': 'Base pair differences were collated based on pair-wise global alignments made between the Kawasaki proximal sequence and all other proximal and distal sequences from the Kawasaki and The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/006866 doi: bioRxiv preprint CH17 haplotypes. Global alignments and variant calls were carried out using \"run-mummer3\" and \"combineMUMs\" commands in MUMmer3.0 58 . A sequence similarity plot was then generated for each pair-wise comparison using 10 kbp windows with a sliding size of 500 bp, as reported previously 17 . Sequences, ~22.5 kbp in length, from the region suspected to harbor a potential recombination event between proximal and distal regions, were extracted from each haplotype (proximal and distal) and aligned using ClustalW 59 within ebioX (http://www.ebioinformatics.org/ebiox/). Recombination/gene conversion analysis based on this alignment was conducted using the Difference of Sums of Squares (DSS) method within TOPALi v2 60, 61 . 
This method is based on comparing the branching patterns of two trees constructed using the first and second halves of sequence alignments within a given window of a larger alignment being analyzed; the fit between these two trees and the calculation of DSS is measured using the sum of squares. Windows in which trees differ significantly between the two halves are scored with high DSS values, and are thus candidate sites for recombination. The parameters used here for this analysis were as follows: a window size of 2.5 Kbp with a step size of 100 bp; the Jukes-Cantor substitution model for calculating distance matrices; 500 bootstrap iterations to test for significance; and the analysis was conducted in both forward and reverse directions along the alignment.', 'cite_spans': [{'start': 618, 'end': 620, 'text': '17', 'ref_id': 'BIBREF15'}, {'start': 1048, 'end': 1051, 'text': '60,', 'ref_id': 'BIBREF59'}, {'start': 1052, 'end': 1054, 'text': '61', 'ref_id': 'BIBREF60'}], 'ref_spans': [], 'section': 'Comparisons of IGK proximal and distal sequence similarity and recombination analysis'}, {'text': 'Locus-wide SNPs were first called in the IGLV, IGKV, and IGHV regions of the CH17 haplotypes by conducting global alignments of the CH17 and Kawasaki or Matsuda haplotypes. The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/006866 doi: bioRxiv preprint (https://hyperbrowser.uio.no). We created \"case-control\" tracks including coordinates overlapped (SD) and not overlapped (non-SD) by SDs in the IGHV/IGKV regions, and then carried out tests for enrichments of CH17 SNPs in these regions by permuting the SD and non-SD status for each IGHV and IGKV set of coordinates (Monte Carlo Simulations = 10,000). The observed enrichments were then compared to the simulated datasets to calculate P values for IGHV and IGKV region analyses. 
For genome-wide centromere/telomere analysis, we used CHM1 variants, and SNP densities in telomeric/centromeric regions were estimated twice independently using telomere/centromere coordinates extended by either 1Mbp or 3Mbp.', 'cite_spans': [], 'ref_spans': [], 'section': 'Analysis of IG loci genomic features, locus-wide alignments, and SNP discovery'}, {'text': '. CC-BY-NC 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/006866 doi: bioRxiv preprint GenBank Accessions: ', 'cite_spans': [], 'ref_spans': [], 'section': 'Analysis of IG loci genomic features, locus-wide alignments, and SNP discovery'}, {'text': 'Tiling path of sequenced CH17 BAC clones and functional and ORF IGKV genes annotated on GRCh37 and CH17 are depicted by filled boxes, with corresponding locus and allele identifiers located above and below the haplotypes. Genes/alleles shared between haplotypes are indicated by filled green boxes. Genes and alleles that are unique to CH17 (not present in the NCBI reference/Kawasaki haplotypes) are indicated by boxes with other colors (red for nonsynonymous and blue for synonymous). Filled blue circles denote loci at which proximal or distal alleles were observed at the alternate locus (e.g., proximal allele at distal locus); and orange circles denote allelic differences between the haplotypes with respect to regulatory elements. The novel sequence extending into the gap in the CH17 IGK path is shown in red. The 21bp indel polymorphism upstream of IGKV1D-8 is indicated with a red arrow.', 'cite_spans': [], 'ref_spans': [], 'section': 'Figure 2. IGKV gene comparison between CH17 and Kawasaki haplotypes'}, {'text': '. CC-BY-NC 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . 
https://doi.org/10.1101/006866 doi: bioRxiv preprint The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. ', 'cite_spans': [], 'ref_spans': [], 'section': 'Figure 2. IGKV gene comparison between CH17 and Kawasaki haplotypes'}], 'bib_entries': {'BIBREF0': {'ref_id': 'b0', 'title': \"Janeway's immunobiology. 7 edn\", 'authors': [{'first': 'K', 'middle': [], 'last': 'Murphy', 'suffix': ''}, {'first': 'P', 'middle': [], 'last': 'Travers', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Walport', 'suffix': ''}], 'year': 2007, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF1': {'ref_id': 'b1', 'title': 'The immunoglobulin FactsBook', 'authors': [{'first': 'M', 'middle': ['P'], 'last': 'Lefranc', 'suffix': ''}, {'first': 'G', 'middle': [], 'last': 'Lefranc', 'suffix': ''}], 'year': 2001, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF2': {'ref_id': 'b2', 'title': 'Somatic generation of antibody diversity', 'authors': [{'first': 'S', 'middle': [], 'last': 'Tonegawa', 'suffix': ''}], 'year': 1983, 'venue': 'Nature', 'volume': '302', 'issn': '', 'pages': '575--581', 'other_ids': {}}, 'BIBREF3': {'ref_id': 'b3', 'title': 'Individual variation in the germline Ig gene repertoire inferred from variable region gene rearrangements', 'authors': [{'first': 'S', 'middle': ['D'], 'last': 'Boyd', 'suffix': ''}], 'year': 2010, 'venue': 'J Immunol', 'volume': '184', 'issn': '', 'pages': '6986--6992', 'other_ids': {'DOI': ['10.4049/jimmunol.1000445']}}, 'BIBREF4': {'ref_id': 'b4', 'title': 'Naive antibody gene-segment frequencies are heritable and unaltered by chronic lymphocyte ablation', 'authors': [{'first': 'J', 'middle': [], 'last': 'Glanville', 'suffix': ''}], 'year': 2011, 'venue': 'Proc Natl Acad Sci U S A', 'volume': '108', 'issn': '', 'pages': '', 'other_ids': {'DOI': ['10.1073/pnas.1107498108']}}, 'BIBREF5': {'ref_id': 'b5', 'title': 'The inference of phased haplotypes for the 
immunoglobulin H chain V region gene loci by analysis of VDJ gene rearrangements', 'authors': [{'first': 'M', 'middle': ['J'], 'last': 'Kidd', 'suffix': ''}], 'year': 2012, 'venue': 'J Immunol', 'volume': '188', 'issn': '', 'pages': '1333--1340', 'other_ids': {'DOI': ['10.4049/jimmunol.1102097']}}, 'BIBREF6': {'ref_id': 'b6', 'title': 'Complete haplotype sequence of the human immunoglobulin heavychain variable, diversity, and joining genes and characterization of allelic and copynumber variation', 'authors': [{'first': 'C', 'middle': ['T'], 'last': 'Watson', 'suffix': ''}], 'year': 2013, 'venue': 'Am J Hum Genet', 'volume': '92', 'issn': '', 'pages': '530--546', 'other_ids': {}}, 'BIBREF7': {'ref_id': 'b7', 'title': 'The immunoglobulin heavy chain locus: genetic variation, missing data, and implications for human disease', 'authors': [{'first': 'C', 'middle': ['T'], 'last': 'Watson', 'suffix': ''}, {'first': 'F', 'middle': [], 'last': 'Breden', 'suffix': ''}], 'year': 2012, 'venue': 'Genes and immunity', 'volume': '13', 'issn': '', 'pages': '363--373', 'other_ids': {'DOI': ['10.1038/gene.2012.12']}}, 'BIBREF8': {'ref_id': 'b8', 'title': 'The human immunoglobulin heavy diversity (IGHD) and joining (IGHJ) segments', 'authors': [{'first': 'M', 'middle': [], 'last': 'Ruiz', 'suffix': ''}, {'first': 'N', 'middle': [], 'last': 'Pallares', 'suffix': ''}, {'first': 'V', 'middle': [], 'last': 'Contet', 'suffix': ''}, {'first': 'V', 'middle': [], 'last': 'Barbi', 'suffix': ''}, {'first': 'M', 'middle': ['P'], 'last': 'Lefranc', 'suffix': ''}], 'year': 1999, 'venue': 'Exp Clin Immunogenet', 'volume': '16', 'issn': '', 'pages': '173--184', 'other_ids': {}}, 'BIBREF9': {'ref_id': 'b9', 'title': 'A defective Vkappa A2 allele in Navajos which may play a role in increased susceptibility to haemophilus influenzae type b disease', 'authors': [{'first': 'A', 'middle': ['J'], 'last': 'Feeney', 'suffix': ''}, {'first': 'M', 'middle': ['J'], 'last': 'Atkinson', 'suffix': ''}, {'first': 
'M', 'middle': ['J'], 'last': 'Cowan', 'suffix': ''}, {'first': 'G', 'middle': [], 'last': 'Escuro', 'suffix': ''}, {'first': 'G', 'middle': [], 'last': 'Lugo', 'suffix': ''}], 'year': 1996, 'venue': 'J Clin Invest', 'volume': '97', 'issn': '', 'pages': '2277--2282', 'other_ids': {}}, 'BIBREF10': {'ref_id': 'b10', 'title': 'Structural and functional bases for broad-spectrum neutralization of avian and human influenza A viruses', 'authors': [{'first': 'J', 'middle': [], 'last': 'Sui', 'suffix': ''}], 'year': 2009, 'venue': 'Nature structural & molecular biology', 'volume': '16', 'issn': '', 'pages': '265--273', 'other_ids': {'DOI': ['10.1038/nsmb.1566']}}, 'BIBREF11': {'ref_id': 'b11', 'title': 'Identification of novel susceptibility Loci for kawasaki disease in a Han chinese population by a genome-wide association study', 'authors': [{'first': 'F.-J', 'middle': [], 'last': 'Tsai', 'suffix': ''}], 'year': 2011, 'venue': 'PLoS One', 'volume': '6', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF12': {'ref_id': 'b12', 'title': 'Polymorphism of immunoglobulin lambda constant region genes in populations from France, Lebanon and Tunisia', 'authors': [{'first': 'N', 'middle': [], 'last': 'Ghanem', 'suffix': ''}], 'year': 1988, 'venue': 'Exp Clin Immunogenet', 'volume': '5', 'issn': '', 'pages': '186--195', 'other_ids': {}}, 'BIBREF13': {'ref_id': 'b13', 'title': 'Allelic polymorphisms and RFLP in the human immunoglobulin lambda light chain locus', 'authors': [{'first': 'M', 'middle': ['P'], 'last': 'Lefranc', 'suffix': ''}, {'first': 'N', 'middle': [], 'last': 'Pallares', 'suffix': ''}, {'first': 'J', 'middle': ['P'], 'last': 'Frippiat', 'suffix': ''}], 'year': 1999, 'venue': 'Hum Genet', 'volume': '104', 'issn': '', 'pages': '361--369', 'other_ids': {}}, 'BIBREF14': {'ref_id': 'b14', 'title': 'Variable amplification of immunoglobulin lambda light-chain genes in human populations', 'authors': [{'first': 'R', 'middle': ['A'], 'last': 'Taub', 'suffix': ''}], 'year': 1983, 
'venue': 'Nature', 'volume': '304', 'issn': '', 'pages': '172--174', 'other_ids': {}}, 'BIBREF15': {'ref_id': 'b15', 'title': 'Evolutionary dynamics of the human immunoglobulin kappa locus and the germline repertoire of the Vkappa genes', 'authors': [{'first': 'K', 'middle': [], 'last': 'Kawasaki', 'suffix': ''}], 'year': 2001, 'venue': 'Eur J Immunol', 'volume': '31', 'issn': '', 'pages': '1017--1028', 'other_ids': {}}, 'BIBREF16': {'ref_id': 'b16', 'title': 'A bacterial artificial chromosome library for sequencing the complete human genome', 'authors': [{'first': 'K', 'middle': [], 'last': 'Osoegawa', 'suffix': ''}], 'year': 2001, 'venue': 'Genome Res', 'volume': '11', 'issn': '', 'pages': '483--496', 'other_ids': {}}, 'BIBREF17': {'ref_id': 'b17', 'title': 'A new apparently functional IGVK gene (VkLa) present in some individuals only', 'authors': [{'first': 'L', 'middle': [], 'last': 'Juul', 'suffix': ''}, {'first': 'L', 'middle': [], 'last': 'Hougs', 'suffix': ''}, {'first': 'T', 'middle': [], 'last': 'Barington', 'suffix': ''}], 'year': 1998, 'venue': 'Immunogenetics', 'volume': '48', 'issn': '', 'pages': '40--46', 'other_ids': {}}, 'BIBREF18': {'ref_id': 'b18', 'title': 'An unusual allelic form of the immunoglobulin lambda constant region genes in the Japanese', 'authors': [{'first': 'P', 'middle': ['H'], 'last': 'Kay', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Moriuchi', 'suffix': ''}, {'first': 'P', 'middle': ['J'], 'last': 'Ma', 'suffix': ''}, {'first': 'E', 'middle': [], 'last': 'Saueracker', 'suffix': ''}], 'year': 1992, 'venue': 'Immunogenetics', 'volume': '35', 'issn': '', 'pages': '341--343', 'other_ids': {}}, 'BIBREF19': {'ref_id': 'b19', 'title': 'Genomic EcoRI polymorphism and cosmid sequencing reveal an insertion/deletion and a new IGLV5 allele in the human immunoglobulin lambda variable locus (22q11.2/IGLV)', 'authors': [{'first': 'C', 'middle': [], 'last': 'Moraes Junta', 'suffix': ''}, {'first': 'G', 'middle': ['A'], 'last': 
'Passos', 'suffix': ''}], 'year': 2003, 'venue': 'Immunogenetics', 'volume': '55', 'issn': '', 'pages': '10--15', 'other_ids': {'DOI': ['10.1007/s00251-003-0549-x']}}, 'BIBREF20': {'ref_id': 'b20', 'title': 'Immunoglobulin lambda light chain orphons on human chromosome 8q11.2', 'authors': [{'first': 'J', 'middle': ['P'], 'last': 'Frippiat', 'suffix': ''}, {'first': 'P', 'middle': [], 'last': 'Dard', 'suffix': ''}, {'first': 'S', 'middle': [], 'last': 'Marsh', 'suffix': ''}, {'first': 'G', 'middle': [], 'last': 'Winter', 'suffix': ''}, {'first': 'M', 'middle': ['P'], 'last': 'Lefranc', 'suffix': ''}], 'year': 1997, 'venue': 'Eur J Immunol', 'volume': '27', 'issn': '', 'pages': '1260--1265', 'other_ids': {}}, 'BIBREF21': {'ref_id': 'b21', 'title': 'Substitution patterns in alleles of immunoglobulin V genes in humans and mice', 'authors': [{'first': 'T', 'middle': [], 'last': 'Romo-Gonzalez', 'suffix': ''}, {'first': 'E', 'middle': [], 'last': 'Vargas-Madrazo', 'suffix': ''}], 'year': 2006, 'venue': 'Mol Immunol', 'volume': '43', 'issn': '', 'pages': '731--744', 'other_ids': {'DOI': ['10.1016/j.molimm.2005.03.018']}}, 'BIBREF22': {'ref_id': 'b22', 'title': 'The human immunoglobulin heavy variable genes', 'authors': [{'first': 'N', 'middle': [], 'last': 'Pallares', 'suffix': ''}, {'first': 'S', 'middle': [], 'last': 'Lefebvre', 'suffix': ''}, {'first': 'V', 'middle': [], 'last': 'Contet', 'suffix': ''}, {'first': 'F', 'middle': [], 'last': 'Matsuda', 'suffix': ''}, {'first': 'M', 'middle': ['P'], 'last': 'Lefranc', 'suffix': ''}], 'year': 1999, 'venue': 'Exp Clin Immunogenet', 'volume': '16', 'issn': '', 'pages': '36--60', 'other_ids': {}}, 'BIBREF23': {'ref_id': 'b23', 'title': 'An integrated map of genetic variation from 1,092 human genomes', 'authors': [{'first': 'Genomes', 'middle': [], 'last': 'Project', 'suffix': ''}, {'first': 'C', 'middle': [], 'last': '', 'suffix': ''}], 'year': 2012, 'venue': 'Nature', 'volume': '491', 'issn': '', 'pages': '56--65', 
'other_ids': {'DOI': ['10.1038/nature11632']}}, 'BIBREF24': {'ref_id': 'b24', 'title': 'The human immunoglobulin kappa locus on yeast artificial chromosomes (YACs)', 'authors': [{'first': 'J', 'middle': [], 'last': 'Brensing-Kuppers', 'suffix': ''}, {'first': 'I', 'middle': [], 'last': 'Zocher', 'suffix': ''}, {'first': 'R', 'middle': [], 'last': 'Thiebe', 'suffix': ''}, {'first': 'H', 'middle': ['G'], 'last': 'Zachau', 'suffix': ''}], 'year': 1997, 'venue': 'Gene', 'volume': '191', 'issn': '', 'pages': '173--181', 'other_ids': {}}, 'BIBREF25': {'ref_id': 'b25', 'title': 'The human immunoglobulin kappa variable (IGKV) genes and joining (IGKJ) segments', 'authors': [{'first': 'V', 'middle': [], 'last': 'Barbie', 'suffix': ''}, {'first': 'M', 'middle': ['P'], 'last': 'Lefranc', 'suffix': ''}], 'year': 1998, 'venue': 'Exp Clin Immunogenet', 'volume': '15', 'issn': '', 'pages': '171--183', 'other_ids': {}}, 'BIBREF26': {'ref_id': 'b26', 'title': 'Composite human VK genes and a model of their evolution', 'authors': [{'first': 'H', 'middle': ['R'], 'last': 'Jaenichen', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Pech', 'suffix': ''}, {'first': 'W', 'middle': [], 'last': 'Lindenmaier', 'suffix': ''}, {'first': 'N', 'middle': [], 'last': 'Wildgruber', 'suffix': ''}, {'first': 'H', 'middle': ['G'], 'last': 'Zachau', 'suffix': ''}], 'year': 1984, 'venue': 'Nucleic Acids Res', 'volume': '12', 'issn': '', 'pages': '5249--5263', 'other_ids': {}}, 'BIBREF27': {'ref_id': 'b27', 'title': 'Primate segmental duplications: crucibles of evolution, diversity and disease', 'authors': [{'first': 'J', 'middle': ['A'], 'last': 'Bailey', 'suffix': ''}, {'first': 'E', 'middle': ['E'], 'last': 'Eichler', 'suffix': ''}], 'year': 2006, 'venue': 'Nat Rev Genet', 'volume': '7', 'issn': '', 'pages': '552--564', 'other_ids': {}}, 'BIBREF28': {'ref_id': 'b28', 'title': 'Recent duplication, domain accretion and the dynamic mutation of the human genome', 'authors': [{'first': 'E', 'middle': 
['E'], 'last': 'Eichler', 'suffix': ''}], 'year': 2001, 'venue': 'Trends Genet', 'volume': '17', 'issn': '', 'pages': '661--669', 'other_ids': {}}, 'BIBREF29': {'ref_id': 'b29', 'title': 'The complete nucleotide sequence of the human immunoglobulin heavy chain variable region locus', 'authors': [{'first': 'F', 'middle': [], 'last': 'Matsuda', 'suffix': ''}], 'year': 1998, 'venue': 'The Journal of experimental medicine', 'volume': '188', 'issn': '', 'pages': '2151--2162', 'other_ids': {}}, 'BIBREF30': {'ref_id': 'b30', 'title': 'Substantial regional variation in substitution rates in the human genome: importance of GC content, gene density, and telomere-specific effects', 'authors': [{'first': 'P', 'middle': ['F'], 'last': 'Arndt', 'suffix': ''}, {'first': 'T', 'middle': [], 'last': 'Hwa', 'suffix': ''}, {'first': 'D', 'middle': ['A'], 'last': 'Petrov', 'suffix': ''}], 'year': 2005, 'venue': 'J Mol Evol', 'volume': '60', 'issn': '', 'pages': '748--763', 'other_ids': {}}, 'BIBREF31': {'ref_id': 'b31', 'title': 'The reported germline repertoire of human immunoglobulin kappa chain genes is relatively complete and accurate', 'authors': [{'first': 'A', 'middle': ['M'], 'last': 'Collins', 'suffix': ''}], 'year': 2008, 'venue': 'Immunogenetics', 'volume': '60', 'issn': '', 'pages': '669--676', 'other_ids': {}}, 'BIBREF32': {'ref_id': 'b32', 'title': 'Divergent human populations show extensive shared IGK rearrangements in peripheral blood B cells', 'authors': [{'first': 'K', 'middle': ['J L'], 'last': 'Jackson', 'suffix': ''}], 'year': 2012, 'venue': 'Immunogenetics', 'volume': '64', 'issn': '', 'pages': '3--14', 'other_ids': {}}, 'BIBREF33': {'ref_id': 'b33', 'title': 'Genomic screening by 454 pyrosequencing identifies a new human IGHV gene and sixteen other new IGHV allelic variants', 'authors': [{'first': 'Y', 'middle': [], 'last': 'Wang', 'suffix': ''}], 'year': 2011, 'venue': 'Immunogenetics', 'volume': '63', 'issn': '', 'pages': '259--265', 'other_ids': {}}, 
'BIBREF34': {'ref_id': 'b34', 'title': 'Masquerading repeats: paralogous pitfalls of the human genome', 'authors': [{'first': 'E', 'middle': ['E'], 'last': 'Eichler', 'suffix': ''}], 'year': 1998, 'venue': 'Genome Res', 'volume': '8', 'issn': '', 'pages': '758--762', 'other_ids': {}}, 'BIBREF35': {'ref_id': 'b35', 'title': 'Chromosomal regions containing high-density and ambiguously mapped putative single nucleotide polymorphisms (SNPs) correlate with segmental duplications in the human genome', 'authors': [{'first': 'X', 'middle': [], 'last': 'Estivill', 'suffix': ''}], 'year': 1987, 'venue': 'Hum Mol Genet', 'volume': '11', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF36': {'ref_id': 'b36', 'title': 'Decreased frequency of rearrangement due to the synergistic effect of nucleotide changes in the heptamer and nonamer of the recombination signal sequence of the V kappa gene A2b, which is associated with increased susceptibility of Navajos to Haemophilus influenzae type b disease', 'authors': [{'first': 'B', 'middle': [], 'last': 'Nadel', 'suffix': ''}], 'year': 1998, 'venue': 'J Immunol', 'volume': '161', 'issn': '', 'pages': '6068--6073', 'other_ids': {}}, 'BIBREF37': {'ref_id': 'b37', 'title': 'Structural analysis of substitution patterns in alleles of human immunoglobulin VH genes', 'authors': [{'first': 'T', 'middle': [], 'last': 'Romo-Gonzalez', 'suffix': ''}, {'first': 'E', 'middle': [], 'last': 'Vargas-Madrazo', 'suffix': ''}], 'year': 2005, 'venue': 'Mol Immunol', 'volume': '42', 'issn': '', 'pages': '1085--1097', 'other_ids': {'DOI': ['10.1016/j.molimm.2004.11.004']}}, 'BIBREF38': {'ref_id': 'b38', 'title': 'Polymorphisms and haplotypes in the human immunoglobulin kappa locus', 'authors': [{'first': 'W', 'middle': [], 'last': 'Pargent', 'suffix': ''}, {'first': 'K', 'middle': ['F'], 'last': 'Schable', 'suffix': ''}, {'first': 'H', 'middle': ['G'], 'last': 'Zachau', 'suffix': ''}], 'year': 1991, 'venue': 'Eur J Immunol', 'volume': '21', 'issn': '', 
'pages': '1829--1835', 'other_ids': {'DOI': ['10.1002/eji.1830210808']}}, 'BIBREF39': {'ref_id': 'b39', 'title': 'The immunoglobulin kappa locus: polymorphism and haplotypes of Caucasoid and non-Caucasoid individuals', 'authors': [{'first': 'G', 'middle': [], 'last': 'Schaible', 'suffix': ''}, {'first': 'G', 'middle': ['A'], 'last': 'Rappold', 'suffix': ''}, {'first': 'W', 'middle': [], 'last': 'Pargent', 'suffix': ''}, {'first': 'H', 'middle': ['G'], 'last': 'Zachau', 'suffix': ''}], 'year': 1993, 'venue': 'Hum Genet', 'volume': '91', 'issn': '', 'pages': '261--267', 'other_ids': {}}, 'BIBREF40': {'ref_id': 'b40', 'title': 'Segmental duplications and copy-number variation in the human genome', 'authors': [{'first': 'A', 'middle': ['J'], 'last': 'Sharp', 'suffix': ''}], 'year': 2005, 'venue': 'Am J Hum Genet', 'volume': '77', 'issn': '', 'pages': '78--88', 'other_ids': {'DOI': ['10.1086/431652']}}, 'BIBREF41': {'ref_id': 'b41', 'title': 'Gene conversion homogenizes the CMT1A paralogous repeats', 'authors': [{'first': 'M', 'middle': ['E'], 'last': 'Hurles', 'suffix': ''}], 'year': 2001, 'venue': 'BMC Genomics', 'volume': '2', 'issn': '', 'pages': '11--11', 'other_ids': {}}, 'BIBREF42': {'ref_id': 'b42', 'title': 'Segmental duplications and gene conversion: Human luteinizing hormone/chorionic gonadotropin beta gene cluster', 'authors': [{'first': 'P', 'middle': [], 'last': 'Hallast', 'suffix': ''}, {'first': 'L', 'middle': [], 'last': 'Nagirnaja', 'suffix': ''}, {'first': 'T', 'middle': [], 'last': 'Margus', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Laan', 'suffix': ''}], 'year': None, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF44': {'ref_id': 'b44', 'title': 'Signatures of selection and gene conversion associated with human color vision variation', 'authors': [{'first': 'B', 'middle': ['C'], 'last': 'Verrelli', 'suffix': ''}, {'first': 'S', 'middle': ['A'], 'last': 'Tishkoff', 'suffix': ''}], 'year': 2004, 'venue': 'Am J 
Hum Genet', 'volume': '75', 'issn': '', 'pages': '363--375', 'other_ids': {}}, 'BIBREF45': {'ref_id': 'b45', 'title': 'Diversity in the CDR3 region of V(H) is sufficient for most antibody specificities', 'authors': [{'first': 'J', 'middle': ['L'], 'last': 'Xu', 'suffix': ''}, {'first': 'M', 'middle': ['M'], 'last': 'Davis', 'suffix': ''}], 'year': 2000, 'venue': 'Immunity', 'volume': '13', 'issn': '', 'pages': '37--45', 'other_ids': {}}, 'BIBREF46': {'ref_id': 'b46', 'title': 'Molecular signatures of hemagglutinin stem-directed heterosubtypic human neutralizing antibodies against influenza A viruses', 'authors': [{'first': 'Y', 'middle': [], 'last': 'Avnir', 'suffix': ''}], 'year': 2014, 'venue': 'PLoS Pathog', 'volume': '10', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF47': {'ref_id': 'b47', 'title': 'Identification of human neutralizing antibodies against MERS-CoV and their role in virus adaptive evolution', 'authors': [{'first': 'X.-C', 'middle': [], 'last': 'Tang', 'suffix': ''}], 'year': 2014, 'venue': 'Proc Natl Acad Sci U S A', 'volume': '111', 'issn': '', 'pages': '2018--2026', 'other_ids': {}}, 'BIBREF48': {'ref_id': 'b48', 'title': 'Organization of the human immunoglobulin lambda light-chain locus on chromosome 22q11.2', 'authors': [{'first': 'J', 'middle': ['P'], 'last': 'Frippiat', 'suffix': ''}], 'year': 1995, 'venue': 'Hum Mol Genet', 'volume': '4', 'issn': '', 'pages': '983--991', 'other_ids': {}}, 'BIBREF49': {'ref_id': 'b49', 'title': 'The immunoglobulin kappa locus-or-what has been learned from looking closely at one-tenth of a percent of the human genome', 'authors': [{'first': 'H', 'middle': ['G'], 'last': 'Zachau', 'suffix': ''}], 'year': 1993, 'venue': 'Gene', 'volume': '135', 'issn': '', 'pages': '167--173', 'other_ids': {}}, 'BIBREF50': {'ref_id': 'b50', 'title': 'The immunoglobulin kappa genes', 'authors': [{'first': 'H', 'middle': ['G'], 'last': 'Zachau', 'suffix': ''}], 'year': 1996, 'venue': 'Immunologist', 'volume': '135', 'issn': 
'', 'pages': '167--173', 'other_ids': {}}, 'BIBREF51': {'ref_id': 'b51', 'title': 'Basic local alignment search tool', 'authors': [{'first': 'S', 'middle': ['F'], 'last': 'Altschul', 'suffix': ''}, {'first': 'W', 'middle': [], 'last': 'Gish', 'suffix': ''}, {'first': 'W', 'middle': [], 'last': 'Miller', 'suffix': ''}, {'first': 'E', 'middle': ['W'], 'last': 'Myers', 'suffix': ''}, {'first': 'D', 'middle': ['J'], 'last': 'Lipman', 'suffix': ''}], 'year': 1990, 'venue': 'J Mol Biol', 'volume': '215', 'issn': '', 'pages': '403--410', 'other_ids': {}}, 'BIBREF52': {'ref_id': 'b52', 'title': 'IMGT/V-QUEST: the highly customized and integrated system for IG and TR standardized V-J and V-D-J sequence analysis', 'authors': [{'first': 'X', 'middle': [], 'last': 'Brochet', 'suffix': ''}, {'first': 'M', 'middle': ['P'], 'last': 'Lefranc', 'suffix': ''}, {'first': 'V', 'middle': [], 'last': 'Giudicelli', 'suffix': ''}], 'year': 2008, 'venue': 'Nucleic Acids Res', 'volume': '36', 'issn': '', 'pages': '503--508', 'other_ids': {'DOI': ['10.1093/nar/gkn316']}}, 'BIBREF53': {'ref_id': 'b53', 'title': 'IMGT standardized analysis of the immunoglobulin (IG) and T cell receptor (TR) nucleotide sequences', 'authors': [{'first': 'V', 'middle': [], 'last': 'Giudicelli', 'suffix': ''}, {'first': 'X', 'middle': [], 'last': 'Brochet', 'suffix': ''}, {'first': 'M.-P', 'middle': [], 'last': 'Lefranc', 'suffix': ''}, {'first': '', 'middle': [], 'last': 'Imgt/V-Quest', 'suffix': ''}], 'year': 2011, 'venue': 'Cold Spring Harb Protoc', 'volume': '', 'issn': '', 'pages': '695--715', 'other_ids': {}}, 'BIBREF54': {'ref_id': 'b54', 'title': 'BEDTools: a flexible suite of utilities for comparing genomic features', 'authors': [{'first': 'A', 'middle': ['R'], 'last': 'Quinlan', 'suffix': ''}, {'first': 'I', 'middle': ['M'], 'last': 'Hall', 'suffix': ''}], 'year': 2010, 'venue': 'Bioinformatics', 'volume': '26', 'issn': '', 'pages': '841--842', 'other_ids': {'DOI': ['10.1093/bioinformatics/btq033']}}, 
'BIBREF55': {'ref_id': 'b55', 'title': 'Miropeats: graphical DNA sequence comparisons', 'authors': [{'first': 'J', 'middle': ['D'], 'last': 'Parsons', 'suffix': ''}], 'year': 1995, 'venue': 'Comput Appl Biosci', 'volume': '11', 'issn': '', 'pages': '615--619', 'other_ids': {}}, 'BIBREF56': {'ref_id': 'b56', 'title': 'Prediction of complete gene structures in human genomic DNA', 'authors': [{'first': 'C', 'middle': [], 'last': 'Burge', 'suffix': ''}, {'first': 'S', 'middle': [], 'last': 'Karlin', 'suffix': ''}], 'year': 1997, 'venue': 'J Mol Biol', 'volume': '268', 'issn': '', 'pages': '78--94', 'other_ids': {}}, 'BIBREF57': {'ref_id': 'b57', 'title': 'Versatile and open software for comparing large genomes', 'authors': [{'first': 'S', 'middle': [], 'last': 'Kurtz', 'suffix': ''}], 'year': 2004, 'venue': 'Genome Biol', 'volume': '5', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF58': {'ref_id': 'b58', 'title': 'Multiple sequence alignment using ClustalW and ClustalX', 'authors': [{'first': 'J', 'middle': ['D'], 'last': 'Thompson', 'suffix': ''}, {'first': 'T', 'middle': ['J'], 'last': 'Gibson', 'suffix': ''}, {'first': 'D', 'middle': ['G'], 'last': 'Higgins', 'suffix': ''}], 'year': 2002, 'venue': 'Curr Protoc Bioinformatics Chapter', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF59': {'ref_id': 'b59', 'title': 'TOPAL 2.0: improved detection of mosaic sequences within multiple alignments', 'authors': [{'first': 'G', 'middle': [], 'last': 'Mcguire', 'suffix': ''}, {'first': 'F', 'middle': [], 'last': 'Wright', 'suffix': ''}], 'year': 2000, 'venue': 'Bioinformatics', 'volume': '16', 'issn': '', 'pages': '130--134', 'other_ids': {}}, 'BIBREF60': {'ref_id': 'b60', 'title': 'TOPALi: software for automatic identification of recombinant sequences within DNA multiple alignments', 'authors': [{'first': 'I', 'middle': [], 'last': 'Milne', 'suffix': ''}], 'year': 2004, 'venue': 'Bioinformatics', 'volume': '20', 'issn': '', 'pages': '1806--1807', 
'other_ids': {'DOI': ['10.1093/bioinformatics/bth155']}}, 'BIBREF61': {'ref_id': 'b61', 'title': 'Segmental duplications: organization and impact within the current human genome project assembly', 'authors': [{'first': 'J', 'middle': ['A'], 'last': 'Bailey', 'suffix': ''}, {'first': 'A', 'middle': ['M'], 'last': 'Yavor', 'suffix': ''}, {'first': 'H', 'middle': ['F'], 'last': 'Massa', 'suffix': ''}, {'first': 'B', 'middle': ['J'], 'last': 'Trask', 'suffix': ''}, {'first': 'E', 'middle': ['E'], 'last': 'Eichler', 'suffix': ''}], 'year': None, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF63': {'ref_id': 'b63', 'title': 'Recent segmental duplications in the human genome', 'authors': [{'first': 'J', 'middle': ['A'], 'last': 'Bailey', 'suffix': ''}], 'year': 2002, 'venue': 'Science', 'volume': '297', 'issn': '', 'pages': '1003--1007', 'other_ids': {'DOI': ['10.1126/science.1072047']}}}, 'ref_entries': {'FIGREF0': {'text': 'Kbp of contiguous sequence (Figure 2). The proximal contig, containing four BACs, spanned from 170 Kbp downstream of IGKC to within the intron of IGKV2-40; thus, this contig lacked 10.9 Kbp of known sequence upstream of IGKV2-40 characterized in the initial genomic description of the locus, representing a small gap in the CH17 sequence. In the distal region, four complete BACs were assembled into a single contig spanning 60 Kbp upstream of IGLV2D-40 to 22 Kbp downstream of the most distal gene in the locus, IGHV3D-7. This sequence included ~49 kbp of additional sequence (compared to the Kawasaki haplotype) extending into the assembly gap between the proximal and distal units, which is predicted to be 800 kbp 17,26 . The sequence extending into the unsequenced gap is dominated by complex repeats (Supplementary Figure 1), likely contributing to the difficulty of completing assemblies in this region. 
Alleles at forty-four functional/ORF IGKV genes, 22 in each of the proximal and distal regions, as well as five IGKJ genes, and a single IGKC gene in the proximal region were characterized and compared to those found in the IGK Kawasaki haplotype (Figure 2; Supplementary', 'latex': None, 'type': 'figure'}, 'FIGREF1': {'text': 'Figure 3B), providing evidence that distal and proximal units have undergone sequence exchange, making the distal unit more similar to the proximal unit in this region in theKawasaki', 'latex': None, 'type': 'figure'}, 'FIGREF2': {'text': 'receptor/leukocyte Ig-like receptor and T cell receptor alpha loci, calculated at ~0.0016 for both regions (Steinberg et al., manuscript in preparation).Similar to patterns noted from V gene allelic diversity and substitution patterns, fewer SVs have also been reported in IGLV and IGKV compared to IGHV. In IGL, for example, only three insertion-deletion variants of functional genes have been identified, involving IGLV1-50, IGLV8-61, and IGLV5-3915,21 ; although, the deletion of IGLV8-61 was identified in only a single individual. Likewise, in the IGKV locus, aside from an identified rare haplotype containing a deletion of the entire distal IGKV gene cluster40,41 , only a single functional V gene insertion, including the gene IGKV1-NL1, has been identified19 . Limited evidence suggesting putative', 'latex': None, 'type': 'figure'}, 'FIGREF4': {'text': '. However, there are several examples demonstrating essential functions of IGHV germline-encoded variation in antigen specificity; for example, residues encoded by germline IGHV1-69 alleles have been shown to make important contributions to neutralizing antibody responses against influenza, hepatitis C, and Middle East Respiratory Syndrome coronavirus 11,47,48 . Whether increased genetic diversity in IGHV is associated with the dominant role of the heavy chain in antibody function remains to be seen. 
In addition, there is still much to be learned about the contribution of IG genetic polymorphism to variability in expressed repertoires and the implications of this variation for susceptibility to infectious and autoimmune diseases, responses to therapeutic antibodies and vaccines, and other clinical outcomes. These outstanding questions stress the importance of accurately representing standing genetic variation in the human IG loci.', 'latex': None, 'type': 'figure'}, 'FIGREF5': {'text': 'generated previously by Kawasaki et al. (we refer to these as \"Kawasaki haplotypes\" 13,17 ) were tested for overlap with 250 bp regions immediately up and down stream of functional and ORF gene exons. Exon coordinates as determined by Vega annotations for each gene were downloaded', 'latex': None, 'type': 'figure'}, 'FIGREF7': {'text': 'IGHV, sequence 10 kbp downstream of IGHV6-1 (the most proximal IGHV gene) to 49 kbp upstream of IGHV3-74 (the most distal IGHV gene) from both CH17 and Matsuda haplotypes 7; structural variants identified between the two haplotypes were removed, leaving 834,802 bp of aligned sequence. For IGKV and IGLV, 10 kbp downstream of the most proximal V gene and 10 Kbp upstream of the most distal V gene were compared totalling 858,805 and 858,244 bp of aligned sequence, respectively (the large insertion variant identified in CH17 within IGLV was not included). The lengths of aligned sequence are based on bp coordinates in the Matsuda (IGH) and Kawasaki (IGL and IGK) haplotypes. CH17 and reference haplotypes were aligned on a per locus basis and SNPs were determined from the resulting alignments using the same commands from MUMmer3.0 referenced above. Single nucleotide variants called from CHM1 cell line DNA using whole-genome paired-end Illumina short-read sequencing (Steinberg et al., manuscript in preparation; NCBI BioProject ID: 176729) were used to assess the accuracy of variants called from the CH17 assemblies. 
To do this, the NCBI remap tool (http://www.ncbi.nlm.nih.gov/genome/tools/remap) was used to convert the GRCh37 coordinates to the CHM1_1.1 assembly coordinates (Genbank Accession: GCA_000306695.2), which were then compared with variant calls generated from the alignment of CHM1 Illumina data to the CHM1_1.1 assemblies. Variants in both callsets were flagged as unsupported by the Illumina data, and deemed errors in the CHM1_1.1 assembly. The coordinates of V gene exon boundaries based on the Vega gene annotation track, repeat content (RepeatMasker 3.2.7; www.repeatmasker.org), SDs 62,63 , and centromere/telomere coordinates were downloaded from UCSC (www.genome.ucsc.edu). Percent sequence identity values for SDs were also downloaded from UCSC and used for partitioning the SD datasets.Locus coverage and SNP density values were calculated using BEDTools version 2.1. We assessed the statistical significance for the observed enrichments of CH17 SNP densities within SDs in the IGHV and IGKV regions using the Genomic Hyperbrowser', 'latex': None, 'type': 'figure'}, 'FIGREF8': {'text': 'Legends', 'latex': None, 'type': 'figure'}, 'FIGREF9': {'text': 'IGLV gene comparison between CH17 and Kawasaki haplotypes. Tiling path of sequenced CH17 BAC clones and functional and ORF IGLV genes annotated on GRCh37 and CH17 are depicted by filled boxes, with corresponding locus and allele identifiers located above and below the haplotypes. Genes/alleles shared between haplotypes are indicated by filled green boxes. Genes and alleles that are unique to CH17 (not present in the NCBI reference/Kawasaki haplotypes) are indicated by boxes with other colors (red for non-synonymous and blue for synonymous). Filled purple circles denote novel alleles with polymorphisms resulting in a nonsense mutation (pseudogene). 
The 11kbp insertion in the CH17 IGL path is indicated.', 'latex': None, 'type': 'figure'}, 'FIGREF10': {'text': 'Detection of putative sequence exchange event between IGK proximal and distal regions. (A) Pair-wise alignments between proximal and distal segmental duplications in the CH17 and Kawasaki haplotypes (Abbreviations: Ka, Kawasaki; prox, proximal; dist, distal). The region where Ka-prox and Ka-dist show stronger similarity than CH17-dist and Ka-prox highlights a potential region of sequence exchange between the two CH17 units (red box). (B) Top panel shows a four-way sequence alignment of a 22.5 Kbp region from the proximal and distal units from within the red box in (A). Blue tick marks indicate bp SNP differences between the sequences. Upward pointing red arrows indicate boundaries of regions where the Kawasaki distal sequence aligns with a higher sequence similarity to the Kawasaki and CH17 proximal sequences than to the CH17 distal sequence, indicative of exchange between proximal and distal regions of the Kawasaki haplotype (sequence similarities: Ka-dist/CH17-dist=98.7%; Ka-dist/Ka-prox=99.7%). A DSS recombination analysis (McGuire and Wright, 2000; see methods) using the same four-way sequence alignment is shown in the bottom panel. The two peaks with the strongest DSS values (downward pointing red arrows) correspond to the predicted breakpoints shown in the top panel based on sequence similarity values. The dotted line across the chart indicates the significance threshold based on the null distribution of DSS values calculated assuming no recombination.', 'latex': None, 'type': 'figure'}, 'FIGREF11': {'text': 'Representation of SNPs identified in the CH17 IG V gene regions in public SNP databases. The number of SNPs identified in IGKV, IGLV, and IGHV gene regions based on alignments of CH17 to Matsuda and Kawasaki haplotypes (excluding gaps). 
The fraction of SNPs represented in dbSNP135 (top) and 1KG (bottom) databases (db) are shown, the total number of novel SNPs not found in either database is indicated at the bottom of the left panel.', 'latex': None, 'type': 'figure'}, 'TABREF0': {'text': 'For IGL, a single 1.25 Mbp contig was generated from 9 BACs spanning 175.7 Kbp upstream of the functional IGLV gene, IGLV4-69, to 191.3 Kbp downstream of IGLC7', 'latex': None, 'type': 'table'}, 'TABREF1': {'text': 'Table 1. IGK and IGL loci CH17 BAC clones and contig statistics Bp mismatches do not include differences involving microsatellite repeat sequence; ** Previously sequenced; NA, not applicable . CC-BY-NC 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/006866 doi: bioRxiv preprint', 'latex': None, 'type': 'table'}, 'TABREF2': {'text': 'IG region genomic feature statistics and SNP density comparisons Calculations in parentheses include inverted segmental duplications between the proximal and distal units.2 Only those genes found in the CH17 and reference haplotypes were considered, excluding duplications and deletions described in IGHV(Watson et al., 2013).The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . 
https://doi.org/10.1101/006866 doi: bioRxiv preprint', 'latex': None, 'type': 'table'}}, 'back_matter': [{'text': 'We are grateful to Marie-Paule Lefranc ', 'cite_spans': [], 'ref_spans': [], 'section': 'Acknowledgements'}]}\n", - "{'paper_id': '4da8a87e614373d56070ed272487451266dce919', 'metadata': {'title': 'Bayesian mixture analysis for metagenomic community profiling', 'authors': [{'first': 'Sofia', 'middle': [], 'last': 'Morfopoulou', 'suffix': '', 'affiliation': {'laboratory': '', 'institution': 'University College London', 'location': {'postCode': 'WC1E 6BT', 'settlement': 'London'}}, 'email': ''}, {'first': 'Vincent', 'middle': [], 'last': 'Plagnol', 'suffix': '', 'affiliation': {'laboratory': '', 'institution': 'University College London', 'location': {'postCode': 'WC1E 6BT', 'settlement': 'London'}}, 'email': ''}]}, 'abstract': [{'text': 'Deep sequencing of clinical samples is now an established tool for the detection of infectious pathogens, with direct medical applications. The large amount of data generated provides an opportunity to detect species even at very low levels, provided that computational tools can effectively interpret potentially complex metagenomic mixtures. Data interpretation is complicated by the fact that short sequencing reads can match multiple organisms and by the lack of completeness of existing databases, in particular for viral pathogens. This interpretation problem can be formulated statistically as a mixture model, where the species of origin of each read is missing, but the complete knowledge of all species present in the mixture helps with the individual reads assignment. Several analytical tools have been proposed to approximately solve this computational problem. Here, we show that the use of parallel Monte Carlo Markov chains (MCMC) for the exploration of the species space enables the identification of the set of species most likely to contribute to the mixture. 
The added accuracy comes at a cost of increased computation time. Our approach is useful for solving complex mixtures involving several related species. We designed our method specifically for the analysis of deep transcriptome sequencing datasets and with a particular focus on viral pathogen detection, but the principles are applicable more generally to all types of metagenomics mixtures. The work is implemented as a user friendly R package, available from CRAN: http://cran.r-project.org/web/packages/metaMix .', 'cite_spans': [], 'ref_spans': [], 'section': 'Abstract'}, {'text': 'Pathoscope identified 47 species. Of these 42 are members of the metagenomic community. 42 are the exact same strain, while 3 are either the same species but different strain or same genus but different species. However it fails to detect 68 species that are actually present in the mixture. Tuning the parameter that . CC-BY-NC-ND 4.0 International license is made available under a', 'cite_spans': [], 'ref_spans': [], 'section': 'Abstract'}], 'body_text': [{'text': 'Metagenomics can be defined as the analysis of a collection of DNA or RNA sequences originating from a single sample. In practice, its scope is broad and includes the analysis of a diverse set of samples such as gut microbiome (Qin et al., 2010) , (Minot et al., 2011) , environmental (Mizuno et al., 2013) or clinical (Willner et al., 2009) , (Negredo et al., 2011 ), (McMullan et al., 2012 samples. Among these applications, the discovery of viral pathogens is clearly relevant for clinical practice (Fancello et al., 2012) , (Chiu, 2013) . The traditional process of characterizing a virus through potentially difficult and time consuming culture techniques is being revolutionized by advances in high throughput sequencing. 
Potential benefits of sequence driven methodologies include a more rapid turnaround time (Quail et al., 2012) , combined with a largely unbiased approach in species detection, including the opportunity for unexpected discoveries.', 'cite_spans': [{'start': 227, 'end': 245, 'text': '(Qin et al., 2010)', 'ref_id': 'BIBREF21'}, {'start': 248, 'end': 268, 'text': '(Minot et al., 2011)', 'ref_id': 'BIBREF18'}, {'start': 285, 'end': 306, 'text': '(Mizuno et al., 2013)', 'ref_id': 'BIBREF19'}, {'start': 319, 'end': 341, 'text': '(Willner et al., 2009)', 'ref_id': 'BIBREF25'}, {'start': 344, 'end': 365, 'text': '(Negredo et al., 2011', 'ref_id': 'BIBREF20'}, {'start': 366, 'end': 391, 'text': '), (McMullan et al., 2012', 'ref_id': 'BIBREF17'}, {'start': 502, 'end': 525, 'text': '(Fancello et al., 2012)', 'ref_id': 'BIBREF8'}, {'start': 528, 'end': 540, 'text': '(Chiu, 2013)', 'ref_id': 'BIBREF4'}, {'start': 817, 'end': 837, 'text': '(Quail et al., 2012)', 'ref_id': 'BIBREF22'}], 'ref_spans': [], 'section': 'Introduction'}, {'text': 'The analysis of shotgun sequencing data from metagenomic mixtures raises complex computational challenges. Part of the difficulty stems from the read length limitation of existing deep DNA sequencing technologies, an issue compounded by the extensive level of homology across viral and bacterial species. Another complication is the divergence of the microbial DNA sequences from the publicly available references. As a consequence, the assignment of a sequencing read to a database organism is often unclear. Lastly, the number of reads originating from a disease causing pathogen can be low (Barzon et al., 2013) . The pathogen contribution to the mixture depends on the biological context, the timing of sample extraction and the type of pathogen considered. 
Therefore, highly sensitive computational approaches are required.', 'cite_spans': [{'start': 593, 'end': 614, 'text': '(Barzon et al., 2013)', 'ref_id': 'BIBREF1'}], 'ref_spans': [], 'section': 'Introduction'}, {'text': 'A first analytical problem is read classification, that is the assignment of a given sequencing read to a species. Several tools have been developed and these belong to two broadly defined classes: compositionbased and similarity-based approaches. The read classification based on sequence composition relies on the intrinsic features of the reads, such as CG content or oligonucleotide distributions. Methods include PhyloPythia (McHardy et al., 2007) , Phymm (Brady and Salzberg, 2009) , MetaCluster (Yang et al., 2010) . These tend to focus on major classes in a dataset and may not perform well on low-abundance populations (Kunin et al., 2008) . Additionally, results are usually reliable for longer reads only (Dröge and McHardy, 2012) .', 'cite_spans': [{'start': 418, 'end': 452, 'text': 'PhyloPythia (McHardy et al., 2007)', 'ref_id': None}, {'start': 461, 'end': 487, 'text': '(Brady and Salzberg, 2009)', 'ref_id': 'BIBREF2'}, {'start': 502, 'end': 521, 'text': '(Yang et al., 2010)', 'ref_id': 'BIBREF27'}, {'start': 628, 'end': 648, 'text': '(Kunin et al., 2008)', 'ref_id': 'BIBREF13'}, {'start': 716, 'end': 741, 'text': '(Dröge and McHardy, 2012)', 'ref_id': 'BIBREF6'}], 'ref_spans': [], 'section': 'Introduction'}, {'text': 'Similarity based methods, using homology search algorithms such as BLAST (Altschul et al., 1990) , are considered the most sensitive methods for read classification (Brady and Salzberg, 2009 ). One of the most popular tools using the output of a similarity search algorithm is MEGAN (Huson et al., 2007) . MEGAN addresses ambiguous matches by assigning reads that have multiple possible assignments to several species, to the taxonomic group containing all these species, or else their lowest common ancestor (LCA). 
This approach is accurate on a higher taxonomic level. However, it is lacking a formal solution to resolving ambiguous matches. A weakness of the similarity based methods is that a long tail of species, each supported only by a few reads can appear in the results. This results from the classification being decided one read at a time, in contrast to considering all reads simultaneously. Hybrid methods combining composition and similarity information such as PhymmBL (Brady and Salzberg, 2009) and RITA (MacDonald et al., 2012) also tackle one read at a time.', 'cite_spans': [{'start': 73, 'end': 96, 'text': '(Altschul et al., 1990)', 'ref_id': 'BIBREF0'}, {'start': 165, 'end': 190, 'text': '(Brady and Salzberg, 2009', 'ref_id': 'BIBREF2'}, {'start': 283, 'end': 303, 'text': '(Huson et al., 2007)', 'ref_id': 'BIBREF11'}, {'start': 985, 'end': 1011, 'text': '(Brady and Salzberg, 2009)', 'ref_id': 'BIBREF2'}, {'start': 1016, 'end': 1045, 'text': 'RITA (MacDonald et al., 2012)', 'ref_id': None}], 'ref_spans': [], 'section': 'Introduction'}, {'text': 'Methods focused on the statistical inference of the set of present species as well as the estimation of their relative proportions, incorporate knowledge from all reads to assign each individual read to a species. From a statistical standpoint, this identification and quantification question can be thought of as an application of mixture models. These ideas have been applied in the metagenomics context in frequentist (GRAMMy (Xia et al., 2011) ) and Bayesian (Pathoscope (Francis et al., 2013) ) settings. GRAMMy formulates the problem as a finite mixture model, using the Expectation-Maximization (EM) algorithm to estimate the relative genome abundances. Pathoscope refines this process by penalizing reads with ambiguous matches in the presence of reads with unique matches and enforcing parsimony within a Bayesian context. 
Both methods work with unassembled sequence data and they are not currently setup to incorporate an initial short read assembly step, which could be achieved by assigning a higher weight to contigs formed by multiple reads. Fitting a mixture model is useful for the species relative abundance estimation, as well as the read to species assignment. A related but distinct question concerns the set of species which should be included in the mixture model. This question is closely related to the biological question of asking what species are present in the mixture. Including all species flagged as potential matches by the read classification can introduce a large number of species, often in the low thousands. Mixture models will, in this situation, identify a large number of species at low levels. This interpretation is appropriate in some applications. In many other cases, the expectation is that the underlying species set should be parsimonious and that some divergence with database species or sequencing errors can explain a large fraction of the non matching reads.', 'cite_spans': [{'start': 429, 'end': 447, 'text': '(Xia et al., 2011)', 'ref_id': 'BIBREF26'}, {'start': 475, 'end': 497, 'text': '(Francis et al., 2013)', 'ref_id': 'BIBREF9'}], 'ref_spans': [], 'section': 'Introduction'}, {'text': 'Hence, a better statistical formulation of the community profiling problem is the exploration of the candidate organisms state-space. In this context, non nested models can be compared based on their marginal likelihood. Within this Bayesian framework, readily interpretable probabilities, such as the posterior probabilities of species sets can be used to quantify the support for a species in the mixture. 
Finally, more complex hypotheses testing for example the number of viral species or the joint presence of two distinct organisms can be investigated.', 'cite_spans': [], 'ref_spans': [], 'section': 'Introduction'}, {'text': 'The main challenge behind such a formulation is computational. Even with a relatively small number of species to consider, the number of subsets of this space that could explain the mixture grows exponentially. Efficient computational strategies are required to make this problem tractable. Here we show that this inference can be achieved for modern scale metagenomics datasets. Our strategy is based on parallel tempering, a Monte Carlo Markov Chain technique, using parallel computing to speed up the inference. We implemented these ideas in a user friendly R package called metaMix. metaMix produces posterior probabilities for various models as well as the relative abundances under each model. We demonstrate its potential using datasets from clinical samples as well as benchmark metagenomic datasets.', 'cite_spans': [], 'ref_spans': [], 'section': 'Introduction'}, {'text': 'We first applied metaMix on a popular benchmark dataset, for which the community composition and the read assignment is known. We then analyzed RNA-Seq datasets from two clinical samples that were generated for diagnostic purposes. We compare our results with the ones produced by MEGAN version 5.3 and Pathoscope 2.0. Both methods are similarity-based. This property and more specifically their flexibility to work with BLASTx output, makes them better candidates for viral discovery compared to compositionbased methods. From the mixture model methods, we have chosen Pathoscope. We were also interested in comparing our results to the ones by GRAMMy, which was the first similarity-based method to use the idea of the mixture model. However, GRAMMy is designed for nucleotide-nucleotide comparisons (BLASTn), which is suboptimal for viral discovery. 
GRAMMy also only considers unassembled reads and requires that these are of the same fixed length. For these reasons, GRAMMy was not included in the comparison. Default parameters were used for all methods, unless stated otherwise.', 'cite_spans': [], 'ref_spans': [], 'section': 'Results'}, {'text': 'For the metaMix output, we reported organisms with a posterior probability greater than 0.9. The metaMix read support parameter r, which essentially sets the sensitivity/specificity of the method, has an impact on the number of reported species. A large r value can result in the method merging together strains that are differentiated by fewer reads than r. On the other hand a low r can have the opposite effect, whereby the methods splits a strain into two or more strains, by moving a few reads from one strain to a very similar one with which they have equally good matches.', 'cite_spans': [], 'ref_spans': [], 'section': 'Results'}, {'text': \"The user's choice for this key parameter r should be informed by the biological context. As an example, for the typical human clinical sample where the sample collection might have occurred some time after the infection has taken place, a low value in order to adopt a sensitive approach is reasonable. Hence, for viral identification in human clinical samples, a low and sensitive value (r = 10) is a reasonable choice. In a highly complex environmental metagenomic community where there is a plethora of species of similar abundances, the choice becomes less straightforward especially in the case of closely related strains. We set the default value for general community profiling in environmental samples to be r = 30. We also compare the output of metaMix for different values of this parameter. 
Supplementary Text 1 presents a detailed discussion on these settings as well as practical considerations.\", 'cite_spans': [], 'ref_spans': [], 'section': 'Results'}, {'text': 'The FAMeS artificial datasets (http://fames.jgi-psf.org/description.html), are simulated metagenomic datasets composed of random reads from 113 isolate microbial genomes present in IMG (Integrated Microbial Genomes) and sequenced at the DOE Joint Genome Institute. They are a popular choice to use as benchmark datasets for various metagenomics methods. Their suitability stems from the fact that the number of species that form the metagenomic community is known as well as their relative abundances. The FAMeS datasets have been designed to model real metagenomic communities in terms of complexity and phylogenetic composition.', 'cite_spans': [], 'ref_spans': [], 'section': 'FAMeS simHC dataset -closely related strains'}, {'text': 'There are three datasets: simHC, simMC, simLC corresponding to high, medium and low complexity of the metagenomic community respectively. The three methods were applied to simHC, as this is the highest complexity dataset, with many closely related strains with similar abundances and no dominant species. The lowest abundance is 255 reads out of 118,000 reads. The bioinformatics processing in this instance consisted of a BLASTn comparison to all NCBI bacterial genomes (ftp://ftp.ncbi.nlm.nih.gov/ genomes/Bacteria/all.fna.tar.gz). 
The number of genomes mapped, retrieved from the the BLASTn output was ∼2,500.', 'cite_spans': [], 'ref_spans': [], 'section': 'FAMeS simHC dataset -closely related strains'}, {'text': 'As discussed below, metaMix outperforms Pathoscope and MEGAN in the community profiling task (Table 1 ) and consequently in the relative abundance estimation (Table 2) .', 'cite_spans': [], 'ref_spans': [{'start': 93, 'end': 101, 'text': '(Table 1', 'ref_id': None}, {'start': 158, 'end': 167, 'text': '(Table 2)', 'ref_id': None}], 'section': 'FAMeS simHC dataset -closely related strains'}, {'text': 'To limit the complexity of the fit, we used the two step procedure described in the Methods and fully implemented in metaMix. We first fitted the mixture model with the complete set of 2,500 species and a limited run length of 500 iterations. Based on this analysis, we identified 1,312 species supported by at least one read and explored this state space. To limit the computational time, we also considered a stronger approximation, including only the 374 potential species supported by at least 10 sequencing reads. Both approaches generated similar results, albeit the more complex one with 1,312 potential species required the quadruple of the computation time (12h instead of 3h). metaMix identified 116 species, detecting successfully all the members of the metagenomic community (Supplementary Table S1 ). These were detected on the strain level except in four instances where a different strain of the same species, or different species within the same genus was detected. Four species were identified and not in the simulated dataset, hence can be considered as false positives (Supplementary Table S1 ). enforces the parsimonious results (any thetaPrior greater than 10), thereby removing the unique read penalty, Pathoscope behaves as a standard mixture model and identifies 165 species. With these settings, it identifies all but one members of the community. 
The organisms are identified at the strain level, except in three instances where it identified different species within the same genus. The major interpretation issue is the presence of a long tail of species (54 species) that are actually not present in the mixture (Supplementary Table S1 ). Pathoscope produced the results in one minute.', 'cite_spans': [], 'ref_spans': [{'start': 787, 'end': 810, 'text': '(Supplementary Table S1', 'ref_id': None}, {'start': 1088, 'end': 1111, 'text': '(Supplementary Table S1', 'ref_id': None}, {'start': 1641, 'end': 1665, 'text': '(Supplementary Table S1', 'ref_id': None}], 'section': 'metaMix'}, {'text': 'MEGAN identified 232 species. It discovered all original species of the community on the strain level, except for 9 instances where it identified the lowest common ancestor (LCA). Aside from the lack of strain or species specificity for 8% of the community members, the main issue is the long tail of false positives species. In the species summary provided by MEGAN, there are 119 species which are not actually present, but supported by a sufficient number of reads (default value: 50 reads) for MEGAN to include these in the output. It finished the computations in less than one minute.', 'cite_spans': [], 'ref_spans': [], 'section': 'MEGAN'}, {'text': 'The primary aim for metaMix is to be a diagnostic tool and to answer whether a species is present or absent from the mixture we study. As a secondary aim, we are also interested in estimating accurately the relative abundance of the present organisms. We can assess the abundance estimates produced by the methods by using error measures such as the relative root mean square error, RRMSE and the average relative error, AVGRE. 
For all methods, when the exact strain was not identified but the correct species or genus was, we used this abundance.', 'cite_spans': [], 'ref_spans': [], 'section': 'Relative abundances'}, {'text': 'where t j is the true abundance of species j and w j the estimated abundance. metaMix produces the most accurate abundance estimates and the results are summarized in Table 2 .', 'cite_spans': [], 'ref_spans': [{'start': 167, 'end': 174, 'text': 'Table 2', 'ref_id': None}], 'section': 'Relative abundances'}, {'text': 'We then assessed the importance of the read support parameter r on the output of metaMix. We ran metaMix on the benchmark simHC FAMeS dataset with r = {10, 20, 30, 50} reads (Table 3, Figure 1 ). We observe that as r decreases, a few more related strains from the reference database that are not in the community are retained in the output. As r increases two similar strains are merged into one. We compared these results with the output of Pathoscope and MEGAN. None of these methods have a read support parameter serving the same purpose as in metaMix, so we tuned the most relevant parameters in these tools. Pathoscope has a thetaPrior parameter that enforces a unique read penalty. This parameter represents the read pseudocounts for the non-unique matches and the default setting is zero which allows for non informative priors. Using the default setting Pathoscope identifies 47 taxa. When thetaP\\'s value is in (1,7) it identifies 22 taxa, while with thetaP> 7 it identifies 165. With this latter setting which is the one we chose for the comparison, Pathoscope behaves as a standard mixture model. MEGAN has a \"Min Support\" parameter which sets a threshold for the number of reads that must be assigned to a taxon so that it appears in the result. Any read assigned to a taxon not having the required support is pushed up the taxonomy until a taxon is found that has sufficient support. We used Min support = {10, 20, 30, 50} reads. 
The respective number of taxa in the summary files were 250, 243, 236, 232.', 'cite_spans': [], 'ref_spans': [{'start': 184, 'end': 192, 'text': 'Figure 1', 'ref_id': 'FIGREF2'}], 'section': 'Importance of read support parameter'}, {'text': \"We then also applied a post-run read count threshold to both methods' output summary. We set the threshold for 10,20,30,50 reads respectively, disregarding taxa that have less than that number of reads assigned to them. In all instances metaMix produces a community profile closer to the real one (Table 3, . CC-BY-NC-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/007476 doi: bioRxiv preprint Figure 1 ) Pathoscope finds ∼ 20 more false positives while MEGAN ∼ 40 more compared to metaMix at the same read support level.\", 'cite_spans': [], 'ref_spans': [{'start': 297, 'end': 306, 'text': '(Table 3,', 'ref_id': None}, {'start': 522, 'end': 530, 'text': 'Figure 1', 'ref_id': 'FIGREF2'}], 'section': 'Importance of read support parameter'}, {'text': 'The simHC from the FAMeS datasets is complex and distinct from typical human clinical samples, putting aside gut microbiome analysis. The differences are the large number of organisms, the presence of closely related strains of similar abundances, as well as the lack of viruses. Nevertheless, it is an essential dataset to use as a benchmark for examining the performance of the methods in a situation of closely related strains in the sample.', 'cite_spans': [], 'ref_spans': [], 'section': 'Importance of read support parameter'}, {'text': 'Protein reference database For the analysis of human clinical samples, we use a custom reference database that combineds viral, bacterial, human and mouse RefSeq proteins. 
All viruses are used (ftp://ftp.ncbi.nlm.nih.gov/refseq/release/ viral/viral.1.protein.faa.gz) as well as all the bacteria of the human microbiome, according to ftp: //ftp.ncbi.nih.gov/genomes/HUMAN_MICROBIOM/Bacteria/all.faa.tar.gz.', 'cite_spans': [], 'ref_spans': [], 'section': 'Human clinical sample -low viral load'}, {'text': 'To test metaMix in a clinical setting with a low viral load, we used a brain biopsy RNA-Seq dataset from an undiagnosed encephalitis patient (UCL Hospital, data provided as part of a collaboration with Professor Breuer, UCL). Total RNA was purified from the biopsy and polyA RNA was separated for sequencing library preparation. The Illumina MiSeq instrument generated 20 million paired-end reads. We processed the raw data using the bioinformatics pipeline described in Methods. The processed dataset consisted of ∼ 75, 000 non-host reads and contigs. Based on the BLASTx output there were 1,298 potential species.', 'cite_spans': [], 'ref_spans': [], 'section': 'Human clinical sample -low viral load'}, {'text': 'Following the initial processing, we used metaMix for species identification and abundance estimation. The resulting species profile is shown in Table 4 ; the 13 metaMix entries correspond to 10 species. The most abundant organism was the φX174 bacteriophage, which is routinely used for deep-sequencing quality control. More interestingly, we identified an astrovirus. Five short assembled contigs (44 reads) with length ranging between 167bp and 471bp and two non-assembled reads were assigned to the Astrovirus VA1 with a probability score of 1 (Figure 2) . metaMix also identified a number of bacteria supported by a few reads. These are either known laboratory reagent contaminants or human skin associated contaminants (Salter et al., 2014) . 
The analysis completed in 29 minutes.', 'cite_spans': [{'start': 725, 'end': 746, 'text': '(Salter et al., 2014)', 'ref_id': 'BIBREF23'}], 'ref_spans': [{'start': 145, 'end': 152, 'text': 'Table 4', 'ref_id': 'TABREF2'}, {'start': 548, 'end': 558, 'text': '(Figure 2)', 'ref_id': 'FIGREF3'}], 'section': 'metaMix'}, {'text': 'The presence of the astrovirus was confirmed using real-time RT-PCR. Genome sequencing of the astrovirus in the sample and subsequent study of the consensus sequence showed that we had in fact identified a novel virus, closely related to the VA1 strain ( (Brown et al., 2014) , in press).', 'cite_spans': [{'start': 255, 'end': 275, 'text': '(Brown et al., 2014)', 'ref_id': 'BIBREF3'}], 'ref_spans': [], 'section': 'metaMix'}, {'text': 'Pathoscope identified 22 taxa, corresponding to 15 species and some genera or families (Table 4) . It also assigned all 46 reads to the Astrovirus VA1. Almost all the species identified from metaMix were identified by Pathoscope, with an additional 9 taxa supported by few reads. As the method can only properly work with unassembled sequence data, an extra BLASTx similarity step had to be performed for the 91,516 reads that had contributed to the 679 assembled contigs. Pathoscope produced the results in less than one minute.', 'cite_spans': [], 'ref_spans': [{'start': 87, 'end': 96, 'text': '(Table 4)', 'ref_id': 'TABREF2'}], 'section': 'Pathoscope'}, {'text': 'MEGAN identified 19 taxa and did not detect the astrovirus signal. We modified the minimum read support parameter from 50 reads to 10 to increase sensitivity. MEGAN then identified 25 taxa, including the Astrovirus VA1. The remaining 24 were mostly genera, relevant to the species detected by metaMix and Pathoscope. MEGAN produced the results in less than one minute. The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . 
https://doi.org/10.1101/007476 doi: bioRxiv preprint', 'cite_spans': [], 'ref_spans': [], 'section': 'MEGAN'}, {'text': 'Human clinical sample -species absent from the database We then compared the performance of the three methods in a scenario where sequences present in the sample are absent from our reference database. We analyzed a second brain biopsy sample from an undiagnosed patient. 32 million RNA-Seq reads were obtained using the HiSeq instrument. Following initial processing using our bionformatics pipeline, the dataset had 1,261,575 non host reads and contigs for subsequent analyses. There were 3,150 potential species based on the BLASTx output.', 'cite_spans': [], 'ref_spans': [], 'section': 'MEGAN'}, {'text': 'The resulting species profile consisted of 7 species (Supplementary Table S2 ). The most interesting finding was the identification of a human coronavirus. We found two different strains: Human coronavirus OC43 had almost a million reads assigned to it. Additionally there were 67K reads assigned to Human enteric coronavirus strain 4408. The presence of both viral strains in the results indicated that even though the virus in the sample was mostly similar to the OC43 strain, there were sequences sharing higher similarity to 4408 at some loci. This is highlighting how the database choice impacts the results: the RefSeq database we used has only one OC43 strain, while in GenBank there are several OC43 strains capturing the high mutation rates of the Human Coronavirus OC43.', 'cite_spans': [], 'ref_spans': [{'start': 68, 'end': 76, 'text': 'Table S2', 'ref_id': None}], 'section': 'metaMix'}, {'text': 'We followed up on the sequences assigned to the \"unknown category\", that is approximately 170K reads, looking for nucleotide similarity with NR-NT using BLASTn. Approximately half of the reads originated from an untranslated region of the Coronavirus genome, which is not captured by the protein reference database. 
The remaining reads matched confidently to either Danio rerio (zebrafish) sequences or Gallus gallus (chicken), two organisms whose proteins are not in the human microbiome reference we are using. The zebrafish and chicken matches were explained as barcode leakage resulting from multiplexing on the same flowcell zebrafish and chicken RNA-Seq libraries. metaMix appropriately assigned these reads to the \"unknown\" category, producing a clean probabilistic summary (Supplementary Table S2 ). The method ran in 4.7 hours.', 'cite_spans': [], 'ref_spans': [{'start': 796, 'end': 804, 'text': 'Table S2', 'ref_id': None}], 'section': 'metaMix'}, {'text': 'In this instance, the metaMix results emphasize the importance of being able to deal with missing reference sequences that do not have a closely related strain or species in the same database.', 'cite_spans': [], 'ref_spans': [], 'section': 'metaMix'}, {'text': 'Pathoscope identified 177 species in this sample (Supplementary Table S2 ). We optimized the value of the unique read penalty parameter and we achieved the best results with the thetaPrior parameter set within the range 10-100. With these settings, the method identified 52 species (Supplementary Table S2 ). Our assessment is that Pathoscope is confused by the lack of completeness of databases combined with the absence of an \"unknown\" category, which prevents it from dealing with these unassigned reads sensibly. Pathoscope completed its analysis in 10 minutes.', 'cite_spans': [], 'ref_spans': [{'start': 64, 'end': 72, 'text': 'Table S2', 'ref_id': None}, {'start': 282, 'end': 305, 'text': '(Supplementary Table S2', 'ref_id': None}], 'section': 'Pathoscope'}, {'text': 'MEGAN assigned the reads to 30 taxa. These included some species and genera but most were families (Supplementary Table S2 ). Approximately 250K reads could not be assigned to any taxonomic level. 
MEGAN run in 8 minutes.', 'cite_spans': [], 'ref_spans': [{'start': 114, 'end': 122, 'text': 'Table S2', 'ref_id': None}], 'section': 'MEGAN'}, {'text': 'Here, we present metaMix, a sensitive method for metagenomic species identification and abundance estimation. The method is implemented in an R package (http://cran.r-project.org/web/packages/metaMix). Using a Bayesian mixture model framework, we account for model uncertainty by performing model averaging and we resolve ambiguous assignments by considering all reads simultaneously. A key feature of the method is that it provides probabilities that answer pertinent biological questions, in particular the posterior probability for the presence of a species in the mixture. Additionally it accurately quantifies the relative proportions of the organisms. The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/007476 doi: bioRxiv preprint This general framework is designed to address interpretation issues associated with closely related strains in the sample, low abundance organisms and absence of genomes from the reference database. We show that metaMix outperforms other methods in the community profiling task, particularly when complex structures with closely related strains are studied. As a consequence, it also produces more accurate relative abundance estimates for the species in the mixture. The method can deal with either unassembled reads or assembled contigs or both, allowing for flexibility of choice for the bioinformatics preprocessing. In practice, the choice of bioinformatics processing prior to the application of our Bayesian mixture analysis must be optimized for each application, and our processing pipeline has been designed with viral sequence identification from transcriptome sequencing as a main goal. 
Nevertheless, as demonstrated by our analysis of the mock bacterial community dataset, the method can be applied in other contexts.', 'cite_spans': [], 'ref_spans': [], 'section': 'Discussion'}, {'text': 'The sensitivity and general applicability of metaMix comes at an increased computational cost, requiring access to a multi-core computer to run efficiently. For the datasets presented here, the computation time remained manageable and did not exceed a few hours, using 12 cores to run 12 parallel chains. Nevertheless, a limitation of metaMix is the increased processing time for very large datasets. Speed related improvements can be implemented in scenarios where the species ambiguity concerns only a small proportion of the read set. Reads with certain assignments can be flagged prior to the MCMC exploration of the state-space. Their assignment information can then be carried forward, thereby reducing the size of the similarity matrix used as input by the mixture model. Another area of possible improvement is MCMC convergence determination. The current version of metaMix produces log-likelihood traceplots allowing the user to visually inspect the MCMC convergence, however additional diagnostic criteria can be implemented in future versions. metaMix is most useful for complex datasets for which the interpretation is challenging. It has been mainly used as a clinical diagnostic tool, helping with the identification of the infecting pathogen while providing an accurate profile of the community in the sample.', 'cite_spans': [], 'ref_spans': [], 'section': 'Discussion'}, {'text': 'Prior to running the mixture model for metagenomic profiling, several steps are required to process the short read sequence data (Figure 3 ). 
The pipeline uses publicly available bioinformatics tools for each preprocessing step.', 'cite_spans': [], 'ref_spans': [{'start': 129, 'end': 138, 'text': '(Figure 3', 'ref_id': 'FIGREF4'}], 'section': 'Bioinformatics preprocessing'}, {'text': \"The first step is the removal of clonal reads using an in house C++ script. We then use PRINSEQ (Schmieder and Edwards, 2011) for read-based quality control, removing low quality and complexity reads and performing 3'end trimming. For metagenomic analysis of human samples, reads originating from the human host are not relevant for our research question. We therefore remove human host reads, using a twostep approach to limit computation time: initially a short read aligner (novoalign, www.novocraft.com), followed by BLASTn. The next step is only applicable when the focus is on virus discovery using transcriptome reads. We remove ribosomal RNA sequences, using BLASTn against the Silva rRNA database (http://www.arb-silva.de/).\", 'cite_spans': [{'start': 96, 'end': 125, 'text': '(Schmieder and Edwards, 2011)', 'ref_id': 'BIBREF24'}], 'ref_spans': [], 'section': 'Bioinformatics preprocessing'}, {'text': 'The remaining reads are assembled into contigs using the Velvet short read assembler (Zerbino and Birney, 2008) . For each contig we record the number of reads required for its assembly, using this information at the stage of species abundance estimation. A Velvet tuning parameter is the user defined k-mer length that specifies the extent of overlap required to assemble read pairs. Metagenomic assembly is not a straightforward task, as short k-mers work best with the low abundance organisms, while long k-mers with the highly abundance ones. 
The shorter the k-mer the greater the chance of spurious overlaps, hence we choose relatively high k-mer length, in order to avoid chimeric contigs.', 'cite_spans': [{'start': 85, 'end': 111, 'text': '(Zerbino and Birney, 2008)', 'ref_id': 'BIBREF28'}], 'ref_spans': [], 'section': 'Bioinformatics preprocessing'}, {'text': 'For each contig and unassembled read we record the potentially originating species, using the nucleotide to protein homology matching tool BLASTx. We use BLASTx due to the higher level of conservation expected at the protein level compared to nucleotides. This choice is guided by our focus in viral pathogens -viruses having high genetic diversity and divergence (Fancello et al., 2012) .', 'cite_spans': [{'start': 364, 'end': 387, 'text': '(Fancello et al., 2012)', 'ref_id': 'BIBREF8'}], 'ref_spans': [], 'section': 'Bioinformatics preprocessing'}, {'text': 'This step generates a sparse similarity matrix between the read sequences and the protein sequences, with species as columns, reads and contigs as rows.', 'cite_spans': [], 'ref_spans': [], 'section': 'Bioinformatics preprocessing'}, {'text': \"The statistical method described in the remainder of this section considers the competing models that could accommodate our observed data, that is the BLASTx results and compares them. The different models . CC-BY-NC-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/007476 doi: bioRxiv preprint represent different sets of species being present in the sample. The method works on two levels of inference: in the first instance we assume a set of species to be present in the sample and we estimate this model's parameters given the data. The other level of inference is the model comparison so as to assess the more plausible model. 
The process is iterated in order to explore the model state space.\", 'cite_spans': [], 'ref_spans': [], 'section': 'Bioinformatics preprocessing'}, {'text': 'Assuming a given set of K species from which the reads can originate, the metagenomic problem can be summarized as a mixture problem, for which the assignment of the sequencing reads to species is unknown and must be determined. The data consist of N sequencing reads X = (x 1 , . . . , x N ), and for a given read x i the likelihood is written as:', 'cite_spans': [], 'ref_spans': [], 'section': 'Model specification assuming a fixed set of species'}, {'text': 'where w = (w 1 , ..., w K ) represent the proportion of each of the K species in the mixture. These mixture weights are constrained such that 0 ≤ w j ≤ 1 and j w j = 1. In practice, we also add a category (species K + 1) which we refer to as the \"unknown\" category, and captures the fact that some reads cannot be assigned to any species.', 'cite_spans': [], 'ref_spans': [], 'section': 'Model specification assuming a fixed set of species'}, {'text': 'Additionally f j (x i ) = P (x i |x i from species j) = p ij is the probability of observing the read x i conditional on the assumption that it originated from species j. We model this probability using the number of mismatches m between the translated read sequence and the reference sequence and a Poisson distribution with parameter λ for that number of mismatches.', 'cite_spans': [], 'ref_spans': [], 'section': 'Model specification assuming a fixed set of species'}, {'text': 'where l g is the length of the reference genome, when short reads are matched to a nucleotide database. For nucleotide matching, l g has a large impact on the probability computation. However, when matching against protein databases, the more limited heterogeneity of protein lengths results in a much smaller impact of the length parameter. 
In addition, incomplete annotation can potentially make the inclusion of protein length problematic for the p ij computation. Consequently, for protein matched sequences, we simply defined our p ij as:', 'cite_spans': [], 'ref_spans': [], 'section': 'Model specification assuming a fixed set of species'}, {'text': 'Therefore for a given set of K species, the p ij probabilities are regarded as known and the mixture weights must be estimated. Combining the above we see that when we know the set of species K, the mixture distribution gives the probability of observing read x i : K j=1 w j p ij , namely equation (3). We therefore write the likelihood of the dataset X as a sum of K n terms:', 'cite_spans': [], 'ref_spans': [], 'section': 'Model specification assuming a fixed set of species'}, {'text': 'Assuming a fixed set of species, the posterior probability distribution of the weights w given the read data X is:', 'cite_spans': [], 'ref_spans': [], 'section': 'Estimation of mixture weights'}, {'text': 'A practical prior for the mixing parameters w is the Dirichlet distribution owing to its conjugate status to the multinomial distribution. Despite the use of conjugate priors, the probabilistic assignment of reads to species involves the expansion of the likelihood into K n terms which is computationally infeasible through direct computation. An efficient estimation can be performed by the introduction of unobserved latent variables that code for the read assignments. In this framework, either the Gibbs sampler (Marin et al., 2005) , a Monte Carlo Markov Chain technique, or the Expectation-Maximization (EM) (Dempster and . CC-BY-NC-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/007476 doi: bioRxiv preprint Laird, 1977) algorithm can be used to estimate the mixture weights w. 
EM returns a point estimate for w while the Gibbs sampler the distribution of w (Supplementary Text 1 for details of implementation). Both methods were implemented and provided comparable results.', 'cite_spans': [{'start': 517, 'end': 537, 'text': '(Marin et al., 2005)', 'ref_id': 'BIBREF15'}, {'start': 615, 'end': 628, 'text': '(Dempster and', 'ref_id': None}], 'ref_spans': [], 'section': 'Estimation of mixture weights'}, {'text': 'Each combination of species corresponds to a finite mixture model for which the marginal likelihood can be estimated. Marginal likelihood comparison has a central role in comparing different models {M 1 , . . . , M m }. To compute the marginal likelihood P (X|M k ) for the mixture model M k one has to average over the parameters with respect to the prior distribution π(θ k |M k ), where θ k are the model parameters:', 'cite_spans': [], 'ref_spans': [], 'section': 'Marginal likelihood estimation'}, {'text': 'The posterior probability of the model M k is:', 'cite_spans': [], 'ref_spans': [], 'section': 'Marginal likelihood estimation'}, {'text': 'where P (M k ) is the prior belief we hold for each model. The prior can be specified depending on the context but the basis of our interpretation is that parsimonious models with a limited number of species are more likely. Thus in this Bayesian framework our default prior uses a penalty limiting the number of species in the model.', 'cite_spans': [], 'ref_spans': [], 'section': 'Marginal likelihood estimation'}, {'text': 'We approximate this penalty factor based on a user-defined parameter r that represents the species read support required by the user to believe in the presence of this species. We compute the logarithmic penalty value as the log-likelihood difference between two models: one where all N reads belong to the \"unknown\" category and one where r reads have a perfect match to some unspecified species and the remaining N − r reads belong to the \"unknown\" category. 
In the nucleotide similarity situation, the p ij probabilities for the r reads originating from this unspecified species are approximated by 1/(median genome length in the reference database). This parameter essentially reflects how many reads are required to provide credible support that a species is present in the mixture and acts as a probabilistic threshold as opposed to a deterministic one applied on a ranked list.', 'cite_spans': [], 'ref_spans': [], 'section': 'Marginal likelihood estimation'}, {'text': 'From now on, when we refer to the marginal likelihood, we mean the marginal likelihood for a specific model and we forego conditioning on the model M k in the notation. Additionally, in our mixture model p ij are always regarded as known, therefore the model parameters θ k are the mixture weights w. Hence (8) becomes:', 'cite_spans': [], 'ref_spans': [], 'section': 'Marginal likelihood estimation'}, {'text': 'P (X) = w P (X|w)π(w)dw', 'cite_spans': [], 'ref_spans': [], 'section': 'Marginal likelihood estimation'}, {'text': 'Approximating the marginal likelihood is a task both difficult and time-consuming. We chose the Defensive Importance Sampling technique (Hesterberg, 1995) for the relative simple implementation compared to other approaches (Supplementary Text 1 for details of implementation). This is crucial as we perform this approximation numerous times, for every species combination we consider.', 'cite_spans': [{'start': 136, 'end': 154, 'text': '(Hesterberg, 1995)', 'ref_id': 'BIBREF10'}], 'ref_spans': [], 'section': 'Marginal likelihood estimation'}, {'text': 'However the goal of this work is to deliver results in a clinical setting within an actionable time-frame. We wish to speed up the computation without compromising the accuracy and the sensitivity of the results. For that reason, we use a point estimate of the marginal likelihood by means of the Expectation-Maximization (EM) algorithm. 
The different approaches were used on the benchmark dataset. The resulting taxonomic assignment as well as the species relative abundance estimates were similar between them, with the EM approach resulting in a 13-fold speed increase (Supplementary Text 1).', 'cite_spans': [], 'ref_spans': [], 'section': 'Marginal likelihood estimation'}, {'text': 'We use a Monte Carlo Markov Chain (MCMC) to explore the set of present species of size 2 S − 1, where S is the total number of potential species. In practice we observe that S can be greater than 1,000. The MCMC must explore the state-space in a clinically useful timespan. Therefore we reduce the size of the state-space, by decreasing the number of S species to the low hundreds. We achieve this by fitting a mixture model . CC-BY-NC-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/007476 doi: bioRxiv preprint with S categories, considering all potential species simultaneously. Post fitting, we retain only the species categories that are not empty, that is categories that have at least one read assigned to them.', 'cite_spans': [], 'ref_spans': [], 'section': 'Model comparison: exploring the set of present species'}, {'text': 'Let us assume that at step t, we deal with a set of species that corresponds to the mixture model M k . At the next step (t + 1), we either add or remove a species and the new set corresponds to the mixture model M l . The step proposing the model M l is accepted with probability:', 'cite_spans': [], 'ref_spans': [], 'section': 'Model comparison: exploring the set of present species'}, {'text': 'where q(M l → M k ) is the probability of transitioning from model M l to model M k . 
In other words, this is the probability of adding or removing the species to the M k set of species that took us to the M l set of species.', 'cite_spans': [], 'ref_spans': [], 'section': 'Model comparison: exploring the set of present species'}, {'text': \"If the step is accepted, then the chain moves to the new proposed state M l . Otherwise if not accepted, the chain's current state becomes the previous state of the chain, i.e the set of species remains unchanged. metaMix outputs log-likelihood traceplots so that the user can visually inspect the mixing and the convergence of the chain. The default setting is to discard the first 20% of the iterations as burn-in. We concentrate on the rest to study the distribution over the model choices and perform model averaging. We can then summarize appropriately the posterior distribution and answer the important questions of interest. Examples of such questions include: what species have probability p or greater being included in the set of present species? what is the probability of having the n specific closely related strains in the set of present species? Depending on the biological context, one may ask numerous similar or other case-specific questions.\", 'cite_spans': [], 'ref_spans': [], 'section': 'Model comparison: exploring the set of present species'}, {'text': 'We observed that simple MCMC does not efficiently explore the complex model state space, as evidenced by the poor mixing of the chain (Figure 4) .', 'cite_spans': [], 'ref_spans': [{'start': 134, 'end': 144, 'text': '(Figure 4)', 'ref_id': 'FIGREF5'}], 'section': 'Optimized implementation: parallel tempering'}, {'text': 'In order to overcome this and take advantage of parallel computing, we run multiple chains and allow exchange moves between them. This method is called parallel tempering MCMC (Earl and Deem, 2005) . 
Within the parallel setting, each chain simulates from the posterior distribution g(M )=P (M |X) raised to a temperature t (0, 1], where model M M represents a set of species being present. The different temperature levels result in tempered versions of the posterior distribution P (M k |X) t=1/T . When T = 1 the draws are from the posterior distribution. On the other hand, at higher temperatures the posterior spreads out its mass and becomes flatter. In practice that means that distributions at higher temperatures are easily sampled, improving the mixing. We are interesting in studying the original posterior distribution with T = 1.', 'cite_spans': [{'start': 176, 'end': 197, 'text': '(Earl and Deem, 2005)', 'ref_id': 'BIBREF7'}], 'ref_spans': [], 'section': 'Optimized implementation: parallel tempering'}, {'text': 'We implemented two types of moves. The first is the mutation step, which simply is the within chain move we described in the previous section. This is accepted with probability given by (11). The other is the exchange step, a between chains move. This Metropolis-Hastings move proposes to swap the value of two chains k and k + 1, adjacent in terms of T . Suppose that the values of the two chains are M k and M k+1 respectively, corresponding to two different sets of species. 
The move is accepted with probability (Jasra et al., 2007) :', 'cite_spans': [{'start': 516, 'end': 536, 'text': '(Jasra et al., 2007)', 'ref_id': 'BIBREF12'}], 'ref_spans': [], 'section': 'Optimized implementation: parallel tempering'}, {'text': 'Since g k (M k )=P (M k |X) T and g k+1 (M k+1 ) = P (M k+1 |X) T +1 , it follows that when M k+1 represents a set of species of higher probability than the one M k represents, the exchange move will always be accepted (Figure 4) .', 'cite_spans': [], 'ref_spans': [{'start': 219, 'end': 229, 'text': '(Figure 4)', 'ref_id': 'FIGREF5'}], 'section': 'Optimized implementation: parallel tempering'}, {'text': 'This allows moves between separate modes, ensuring a global exploration of the model state space. Eventually \"hot\" and \"cold\" chains will progress towards a global mode.', 'cite_spans': [], 'ref_spans': [], 'section': 'Optimized implementation: parallel tempering'}, {'text': 'the Annals of Human Genetics. This work is also supported by the National Institute for Health Research University College London Hospitals Biomedical Research Centre. For metaMix that is r={10, 20, 30, 50} reads, for Pathoscope thetaPrior> 7+ post-run threshold ={10, 20, 30, 50} reads, for MEGAN \"Min Support\" + post-run threshold ={10, 20, 30, 50} reads. The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/007476 doi: bioRxiv preprint Tables . CC-BY-NC-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . 
https://doi.org/10.1101/007476 doi: bioRxiv preprint Staphylococcus epidermidis --10 28211', 'cite_spans': [], 'ref_spans': [{'start': 598, 'end': 604, 'text': 'Tables', 'ref_id': None}], 'section': 'Optimized implementation: parallel tempering'}, {'text': 'Alphaproteobacteria --10 28037', 'cite_spans': [], 'ref_spans': [], 'section': 'Figure Legends'}, {'text': 'Streptococcus mitis --8 562', 'cite_spans': [], 'ref_spans': [], 'section': 'Figure Legends'}, {'text': 'Escherichia coli --8 509173', 'cite_spans': [], 'ref_spans': [], 'section': 'Figure Legends'}, {'text': 'Acinetobacter baumannii AYE --7 41297', 'cite_spans': [], 'ref_spans': [], 'section': 'Figure Legends'}, {'text': 'Sphingomonadaceae --6 40214', 'cite_spans': [], 'ref_spans': [], 'section': 'Figure Legends'}, {'text': 'Acinetobacter johnsonii --6 29391', 'cite_spans': [], 'ref_spans': [], 'section': 'Figure Legends'}, {'text': 'Gemella morbillorum --5 76122', 'cite_spans': [], 'ref_spans': [], 'section': 'Figure Legends'}, {'text': 'Alloprevotella tannerae --4 652103', 'cite_spans': [], 'ref_spans': [], 'section': 'Figure Legends'}, {'text': 'Rhodopseudomonas palustris DX-1 --2 268747', 'cite_spans': [], 'ref_spans': [], 'section': 'Figure Legends'}, {'text': 'Prochlorococcus phage P-SSM4 --2 . CC-BY-NC-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . 
https://doi.org/10.1101/007476 doi: bioRxiv preprint', 'cite_spans': [], 'ref_spans': [], 'section': 'Figure Legends'}], 'bib_entries': {'BIBREF0': {'ref_id': 'b0', 'title': 'Basic local alignment search tool', 'authors': [{'first': 'S', 'middle': ['F'], 'last': 'Altschul', 'suffix': ''}, {'first': 'W', 'middle': [], 'last': 'Gish', 'suffix': ''}, {'first': 'W', 'middle': [], 'last': 'Miller', 'suffix': ''}, {'first': 'E', 'middle': ['W'], 'last': 'Myers', 'suffix': ''}, {'first': 'D', 'middle': ['J'], 'last': 'Lipman', 'suffix': ''}], 'year': 1990, 'venue': 'Journal of Molecular Biology', 'volume': '215', 'issn': '3', 'pages': '403--410', 'other_ids': {}}, 'BIBREF1': {'ref_id': 'b1', 'title': 'Next-generation sequencing technologies in diagnostic virology', 'authors': [{'first': 'L', 'middle': [], 'last': 'Barzon', 'suffix': ''}, {'first': 'E', 'middle': [], 'last': 'Lavezzo', 'suffix': ''}, {'first': 'G', 'middle': [], 'last': 'Costanzi', 'suffix': ''}, {'first': 'E', 'middle': [], 'last': 'Franchin', 'suffix': ''}, {'first': 'S', 'middle': [], 'last': 'Toppo', 'suffix': ''}, {'first': 'G', 'middle': [], 'last': 'Palù', 'suffix': ''}], 'year': 2013, 'venue': 'Journal of Clinical Virology', 'volume': '58', 'issn': '2', 'pages': '346--50', 'other_ids': {}}, 'BIBREF2': {'ref_id': 'b2', 'title': 'Phymm and PhymmBL: metagenomic phylogenetic classification with interpolated Markov models', 'authors': [{'first': 'A', 'middle': [], 'last': 'Brady', 'suffix': ''}, {'first': 'S', 'middle': ['L'], 'last': 'Salzberg', 'suffix': ''}], 'year': 2009, 'venue': 'Nature Methods', 'volume': '6', 'issn': '9', 'pages': '673--679', 'other_ids': {}}, 'BIBREF3': {'ref_id': 'b3', 'title': 'Astrovirus VA1/HMO-C: an increasingly recognised neurotropic pathogen in immunocompromised patients', 'authors': [{'first': 'J', 'middle': ['R'], 'last': 'Brown', 'suffix': ''}, {'first': 'S', 'middle': [], 'last': 'Morfopoulou', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Hubb', 'suffix': 
''}, {'first': 'W', 'middle': ['A'], 'last': 'Emmett', 'suffix': ''}, {'first': 'W', 'middle': [], 'last': 'Ip', 'suffix': ''}, {'first': 'D', 'middle': [], 'last': 'Shah', 'suffix': ''}, {'first': 'T', 'middle': [], 'last': 'Brooks', 'suffix': ''}, {'first': 'S', 'middle': ['M'], 'last': 'Paine', 'suffix': ''}, {'first': 'G', 'middle': [], 'last': 'Anderson', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Virasami', 'suffix': ''}], 'year': 2014, 'venue': 'Clinical Infectious Diseases', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF4': {'ref_id': 'b4', 'title': 'Viral pathogen discovery', 'authors': [{'first': 'C', 'middle': ['Y'], 'last': 'Chiu', 'suffix': ''}], 'year': 2013, 'venue': 'Current Opinion in Microbiology', 'volume': '16', 'issn': '4', 'pages': '468--78', 'other_ids': {}}, 'BIBREF5': {'ref_id': 'b5', 'title': 'Maximum likelihood from incomplete data via the EM algorithm', 'authors': [{'first': 'A', 'middle': [], 'last': 'Dempster', 'suffix': ''}, {'first': 'N', 'middle': [], 'last': 'Laird', 'suffix': ''}], 'year': 1977, 'venue': 'Journal of the Royal Statistical Society', 'volume': '39', 'issn': '1', 'pages': '1--38', 'other_ids': {}}, 'BIBREF6': {'ref_id': 'b6', 'title': 'Taxonomic binning of metagenome samples generated by nextgeneration sequencing technologies', 'authors': [{'first': 'J', 'middle': [], 'last': 'Dröge', 'suffix': ''}, {'first': 'A', 'middle': ['C'], 'last': 'Mchardy', 'suffix': ''}], 'year': 2012, 'venue': 'Briefings in Bioinformatics', 'volume': '13', 'issn': '6', 'pages': '646--55', 'other_ids': {}}, 'BIBREF7': {'ref_id': 'b7', 'title': 'Parallel tempering: theory, applications, and new perspectives', 'authors': [{'first': 'D', 'middle': ['J'], 'last': 'Earl', 'suffix': ''}, {'first': 'M', 'middle': ['W'], 'last': 'Deem', 'suffix': ''}], 'year': 2005, 'venue': 'Physical Chemistry Chemical Physics', 'volume': '7', 'issn': '23', 'pages': '3910--3916', 'other_ids': {}}, 'BIBREF8': {'ref_id': 'b8', 'title': 
'Computational tools for viral metagenomics and their application in clinical research', 'authors': [{'first': 'L', 'middle': [], 'last': 'Fancello', 'suffix': ''}, {'first': 'D', 'middle': [], 'last': 'Raoult', 'suffix': ''}, {'first': 'C', 'middle': [], 'last': 'Desnues', 'suffix': ''}], 'year': 2012, 'venue': 'Virology', 'volume': '434', 'issn': '2', 'pages': '162--74', 'other_ids': {}}, 'BIBREF9': {'ref_id': 'b9', 'title': 'Pathoscope: species identification and strain attribution with unassembled sequencing data', 'authors': [{'first': 'O', 'middle': ['E'], 'last': 'Francis', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Bendall', 'suffix': ''}, {'first': 'S', 'middle': [], 'last': 'Manimaran', 'suffix': ''}, {'first': 'C', 'middle': [], 'last': 'Hong', 'suffix': ''}, {'first': 'N', 'middle': ['L'], 'last': 'Clement', 'suffix': ''}, {'first': 'E', 'middle': [], 'last': 'Castro-Nallar', 'suffix': ''}, {'first': 'Q', 'middle': [], 'last': 'Snell', 'suffix': ''}, {'first': 'G', 'middle': ['B'], 'last': 'Schaalje', 'suffix': ''}, {'first': 'M', 'middle': ['J'], 'last': 'Clement', 'suffix': ''}, {'first': 'K', 'middle': ['A'], 'last': 'Crandall', 'suffix': ''}], 'year': 2013, 'venue': 'Genome Research', 'volume': '23', 'issn': '10', 'pages': '1721--1730', 'other_ids': {}}, 'BIBREF10': {'ref_id': 'b10', 'title': 'Weighted average importance sampling and defensive mixture distributions', 'authors': [{'first': 'T', 'middle': [], 'last': 'Hesterberg', 'suffix': ''}], 'year': 1995, 'venue': '', 'volume': '37', 'issn': '', 'pages': '185--194', 'other_ids': {}}, 'BIBREF11': {'ref_id': 'b11', 'title': 'MEGAN analysis of metagenomic data', 'authors': [{'first': 'D', 'middle': ['H'], 'last': 'Huson', 'suffix': ''}, {'first': 'A', 'middle': ['F'], 'last': 'Auch', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Qi', 'suffix': ''}, {'first': 'S', 'middle': ['C'], 'last': 'Schuster', 'suffix': ''}], 'year': 2007, 'venue': 'Genome Research', 'volume': '17', 'issn': 
'3', 'pages': '377--386', 'other_ids': {}}, 'BIBREF12': {'ref_id': 'b12', 'title': 'On population-based simulation for static inference', 'authors': [{'first': 'A', 'middle': [], 'last': 'Jasra', 'suffix': ''}, {'first': 'D', 'middle': ['A'], 'last': 'Stephens', 'suffix': ''}, {'first': 'C', 'middle': ['C'], 'last': 'Holmes', 'suffix': ''}], 'year': 2007, 'venue': 'Statistics and Computing', 'volume': '17', 'issn': '3', 'pages': '263--279', 'other_ids': {}}, 'BIBREF13': {'ref_id': 'b13', 'title': \"A bioinformatician's guide to metagenomics\", 'authors': [{'first': 'V', 'middle': [], 'last': 'Kunin', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Copeland', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Lapidus', 'suffix': ''}, {'first': 'K', 'middle': [], 'last': 'Mavromatis', 'suffix': ''}, {'first': 'P', 'middle': [], 'last': 'Hugenholtz', 'suffix': ''}], 'year': 2008, 'venue': 'Microbiology and Molecular Biology Reviews', 'volume': '72', 'issn': '4', 'pages': '557--78', 'other_ids': {}}, 'BIBREF14': {'ref_id': 'b14', 'title': 'Rapid identification of high-confidence taxonomic assignments for metagenomic data', 'authors': [{'first': 'N', 'middle': ['J'], 'last': 'Macdonald', 'suffix': ''}, {'first': 'D', 'middle': ['H'], 'last': 'Parks', 'suffix': ''}, {'first': 'R', 'middle': ['G'], 'last': 'Beiko', 'suffix': ''}], 'year': 2012, 'venue': 'Nucleic Acids Research', 'volume': '40', 'issn': '14', 'pages': '', 'other_ids': {}}, 'BIBREF15': {'ref_id': 'b15', 'title': 'Bayesian modelling and inference on mixtures of distributions. 
Handbook of statistics', 'authors': [{'first': 'J.-M', 'middle': [], 'last': 'Marin', 'suffix': ''}, {'first': 'K', 'middle': [], 'last': 'Mengersen', 'suffix': ''}, {'first': 'Robert', 'middle': [], 'last': '', 'suffix': ''}, {'first': 'C', 'middle': [], 'last': '', 'suffix': ''}], 'year': 2005, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF16': {'ref_id': 'b16', 'title': 'Accurate phylogenetic classification of variable-length DNA fragments', 'authors': [{'first': 'A', 'middle': ['C'], 'last': 'Mchardy', 'suffix': ''}, {'first': 'H', 'middle': ['G'], 'last': 'Martín', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Tsirigos', 'suffix': ''}, {'first': 'P', 'middle': [], 'last': 'Hugenholtz', 'suffix': ''}, {'first': 'Rigoutsos', 'middle': [], 'last': '', 'suffix': ''}, {'first': 'I', 'middle': [], 'last': '', 'suffix': ''}], 'year': 2007, 'venue': 'Nature Methods', 'volume': '4', 'issn': '1', 'pages': '63--72', 'other_ids': {}}, 'BIBREF17': {'ref_id': 'b17', 'title': 'A new phlebovirus associated with severe febrile illness in Missouri', 'authors': [{'first': 'L', 'middle': ['K'], 'last': 'Mcmullan', 'suffix': ''}, {'first': 'S', 'middle': ['M'], 'last': 'Folk', 'suffix': ''}, {'first': 'A', 'middle': ['J'], 'last': 'Kelly', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Macneil', 'suffix': ''}, {'first': 'C', 'middle': ['S'], 'last': 'Goldsmith', 'suffix': ''}, {'first': 'M', 'middle': ['G'], 'last': 'Metcalfe', 'suffix': ''}, {'first': 'B', 'middle': ['C'], 'last': 'Batten', 'suffix': ''}, {'first': 'C', 'middle': ['G'], 'last': 'Albariño', 'suffix': ''}, {'first': 'S', 'middle': ['R'], 'last': 'Zaki', 'suffix': ''}, {'first': 'P', 'middle': ['E'], 'last': 'Rollin', 'suffix': ''}], 'year': 2012, 'venue': 'The New England Journal of Medicine', 'volume': '367', 'issn': '9', 'pages': '834--875', 'other_ids': {}}, 'BIBREF18': {'ref_id': 'b18', 'title': 'The human gut virome: inter-individual variation and dynamic response to 
diet', 'authors': [{'first': 'S', 'middle': [], 'last': 'Minot', 'suffix': ''}, {'first': 'R', 'middle': [], 'last': 'Sinha', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Chen', 'suffix': ''}, {'first': 'H', 'middle': [], 'last': 'Li', 'suffix': ''}, {'first': 'S', 'middle': ['A'], 'last': 'Keilbaugh', 'suffix': ''}, {'first': 'G', 'middle': ['D'], 'last': 'Wu', 'suffix': ''}, {'first': 'J', 'middle': ['D'], 'last': 'Lewis', 'suffix': ''}, {'first': 'F', 'middle': ['D'], 'last': 'Bushman', 'suffix': ''}], 'year': 2011, 'venue': 'Genome Research', 'volume': '21', 'issn': '10', 'pages': '1616--1641', 'other_ids': {}}, 'BIBREF19': {'ref_id': 'b19', 'title': 'Expanding the marine virosphere using metagenomics', 'authors': [{'first': 'C', 'middle': ['M'], 'last': 'Mizuno', 'suffix': ''}, {'first': 'F', 'middle': [], 'last': 'Rodriguez-Valera', 'suffix': ''}, {'first': 'N', 'middle': ['E'], 'last': 'Kimes', 'suffix': ''}, {'first': 'R', 'middle': [], 'last': 'Ghai', 'suffix': ''}], 'year': 2013, 'venue': 'PLoS Genetics', 'volume': '9', 'issn': '12', 'pages': '', 'other_ids': {}}, 'BIBREF20': {'ref_id': 'b20', 'title': 'Discovery of an ebolavirus-like filovirus in europe', 'authors': [{'first': 'A', 'middle': [], 'last': 'Negredo', 'suffix': ''}, {'first': 'G', 'middle': [], 'last': 'Palacios', 'suffix': ''}, {'first': 'S', 'middle': [], 'last': 'Vázquez-Morón', 'suffix': ''}, {'first': 'F', 'middle': [], 'last': 'González', 'suffix': ''}, {'first': 'H', 'middle': [], 'last': 'Dopazo', 'suffix': ''}, {'first': 'F', 'middle': [], 'last': 'Molero', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Juste', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Quetglas', 'suffix': ''}, {'first': 'N', 'middle': [], 'last': 'Savji', 'suffix': ''}, {'first': '', 'middle': [], 'last': 'De La Cruz', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Martínez', 'suffix': ''}], 'year': 2011, 'venue': 'PLoS Pathogens', 'volume': '7', 'issn': '10', 'pages': '', 
'other_ids': {}}, 'BIBREF21': {'ref_id': 'b21', 'title': 'A human gut microbial gene catalogue established by metagenomic sequencing', 'authors': [{'first': 'J', 'middle': [], 'last': 'Qin', 'suffix': ''}, {'first': 'R', 'middle': [], 'last': 'Li', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Raes', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Arumugam', 'suffix': ''}, {'first': 'K', 'middle': ['S'], 'last': 'Burgdorf', 'suffix': ''}, {'first': 'C', 'middle': [], 'last': 'Manichanh', 'suffix': ''}, {'first': 'T', 'middle': [], 'last': 'Nielsen', 'suffix': ''}, {'first': 'N', 'middle': [], 'last': 'Pons', 'suffix': ''}, {'first': 'F', 'middle': [], 'last': 'Levenez', 'suffix': ''}, {'first': 'T', 'middle': [], 'last': 'Yamada', 'suffix': ''}], 'year': 2010, 'venue': 'Nature', 'volume': '464', 'issn': '7285', 'pages': '59--65', 'other_ids': {}}, 'BIBREF22': {'ref_id': 'b22', 'title': 'A tale of three next generation sequencing platforms: comparison of Ion Torrent, Pacific Biosciences and Illumina MiSeq sequencers', 'authors': [{'first': 'M', 'middle': [], 'last': 'Quail', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Smith', 'suffix': ''}, {'first': 'P', 'middle': [], 'last': 'Coupland', 'suffix': ''}, {'first': 'T', 'middle': ['D'], 'last': 'Otto', 'suffix': ''}, {'first': 'S', 'middle': ['R'], 'last': 'Harris', 'suffix': ''}, {'first': 'T', 'middle': ['R'], 'last': 'Connor', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Bertoni', 'suffix': ''}, {'first': 'H', 'middle': ['P'], 'last': 'Swerdlow', 'suffix': ''}, {'first': 'Y', 'middle': [], 'last': 'Gu', 'suffix': ''}], 'year': 2012, 'venue': 'BMC genomics', 'volume': '13', 'issn': '1', 'pages': '', 'other_ids': {}}, 'BIBREF23': {'ref_id': 'b23', 'title': 'Reagent contamination can critically impact sequence-based microbiome analyses', 'authors': [{'first': 'S', 'middle': [], 'last': 'Salter', 'suffix': ''}, {'first': 'M', 'middle': ['J'], 'last': 'Cox', 'suffix': ''}, {'first': 'E', 
'middle': ['M'], 'last': 'Turek', 'suffix': ''}, {'first': 'S', 'middle': ['T'], 'last': 'Calus', 'suffix': ''}, {'first': 'W', 'middle': ['O'], 'last': 'Cookson', 'suffix': ''}, {'first': 'M', 'middle': ['F'], 'last': 'Moffatt', 'suffix': ''}, {'first': 'P', 'middle': [], 'last': 'Turner', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Parkhill', 'suffix': ''}, {'first': 'N', 'middle': [], 'last': 'Loman', 'suffix': ''}, {'first': 'A', 'middle': ['W'], 'last': 'Walker', 'suffix': ''}], 'year': 2014, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {'DOI': ['10.1101/007187']}}, 'BIBREF24': {'ref_id': 'b24', 'title': 'Quality control and preprocessing of metagenomic datasets', 'authors': [{'first': 'R', 'middle': [], 'last': 'Schmieder', 'suffix': ''}, {'first': 'R', 'middle': [], 'last': 'Edwards', 'suffix': ''}], 'year': 2011, 'venue': 'Bioinformatics', 'volume': '27', 'issn': '6', 'pages': '863--867', 'other_ids': {}}, 'BIBREF25': {'ref_id': 'b25', 'title': 'Metagenomic analysis of respiratory tract DNA viral communities in cystic fibrosis and non-cystic fibrosis individuals', 'authors': [{'first': 'D', 'middle': [], 'last': 'Willner', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Furlan', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Haynes', 'suffix': ''}, {'first': 'R', 'middle': [], 'last': 'Schmieder', 'suffix': ''}, {'first': 'F', 'middle': ['E'], 'last': 'Angly', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Silva', 'suffix': ''}, {'first': 'S', 'middle': [], 'last': 'Tammadoni', 'suffix': ''}, {'first': 'B', 'middle': [], 'last': 'Nosrat', 'suffix': ''}, {'first': 'D', 'middle': [], 'last': 'Conrad', 'suffix': ''}, {'first': 'F', 'middle': [], 'last': 'Rohwer', 'suffix': ''}], 'year': 2009, 'venue': 'PLoS One', 'volume': '4', 'issn': '10', 'pages': '', 'other_ids': {}}, 'BIBREF26': {'ref_id': 'b26', 'title': 'Accurate genome relative abundance estimation based on shotgun metagenomic reads', 'authors': [{'first': 
'L', 'middle': ['C'], 'last': 'Xia', 'suffix': ''}, {'first': 'J', 'middle': ['A'], 'last': 'Cram', 'suffix': ''}, {'first': 'T', 'middle': [], 'last': 'Chen', 'suffix': ''}, {'first': 'J', 'middle': ['A'], 'last': 'Fuhrman', 'suffix': ''}, {'first': 'F', 'middle': [], 'last': 'Sun', 'suffix': ''}], 'year': 2011, 'venue': 'PLoS One', 'volume': '6', 'issn': '12', 'pages': '', 'other_ids': {}}, 'BIBREF27': {'ref_id': 'b27', 'title': 'Unsupervised binning of environmental genomic fragments based on an error robust selection of l-mers', 'authors': [{'first': 'B', 'middle': [], 'last': 'Yang', 'suffix': ''}, {'first': 'Y', 'middle': [], 'last': 'Peng', 'suffix': ''}, {'first': 'H. C.-M', 'middle': [], 'last': 'Leung', 'suffix': ''}, {'first': 'S.-M', 'middle': [], 'last': 'Yiu', 'suffix': ''}, {'first': 'J.-C', 'middle': [], 'last': 'Chen', 'suffix': ''}, {'first': 'F', 'middle': ['Y'], 'last': 'Chin', 'suffix': ''}, {'first': '.-L', 'middle': [], 'last': '', 'suffix': ''}], 'year': 2010, 'venue': 'BMC bioinformatics', 'volume': '11', 'issn': '2', 'pages': '', 'other_ids': {}}, 'BIBREF28': {'ref_id': 'b28', 'title': 'Velvet: algorithms for de novo short read assembly using de Bruijn graphs', 'authors': [{'first': 'D', 'middle': ['R'], 'last': 'Zerbino', 'suffix': ''}, {'first': 'E', 'middle': [], 'last': 'Birney', 'suffix': ''}], 'year': 2008, 'venue': 'Genome Research', 'volume': '18', 'issn': '5', 'pages': '821--829', 'other_ids': {}}}, 'ref_entries': {'FIGREF2': {'text': 'Number of species in the simulated simHC FAMeS dataset detected by metaMix, Pathoscope and MEGAN, as a function of the minimum number of reads required for each species to appear in the output.', 'latex': None, 'type': 'figure'}, 'FIGREF3': {'text': 'Human clinical sample -novel virus. The reads (blue lines) assigned by metaMix to Astrovirus VA1, aligned to its genome. 
The purple lines represent the genes of the virus.', 'latex': None, 'type': 'figure'}, 'FIGREF4': {'text': 'Pipeline steps for species identification. The step of removing the rRNA sequences is applicable only when the aim is viral discovery.', 'latex': None, 'type': 'figure'}, 'FIGREF5': {'text': 'a. Log-likelihood trace plot for single chain MCMC and b. for PT chain at temperature T=1. c. Schematic of parallel tempering. Exchanges are attempted between chains of neighboring temperatures, where Chain1 at T 1 = 1, T 1 < T 2 < T 3 < T 4 .', 'latex': None, 'type': 'figure'}, 'FIGREF6': {'text': 'Figures', 'latex': None, 'type': 'figure'}, 'FIGREF8': {'text': 'Figure 3', 'latex': None, 'type': 'figure'}, 'FIGREF10': {'text': 'Figure 4', 'latex': None, 'type': 'figure'}, 'TABREF1': {'text': 'Number of species identified for the FAMeS simHC dataset. Measures of estimation accuracy of relative abundances for the FAMeS simHC dataset.Table 3. Number of species in the simHC FAMeS dataset by metaMix, Pathoscope and MEGAN', 'latex': None, 'type': 'table'}, 'TABREF2': {'text': 'Human clinical sample -novel virus. Comparison of community profile: metaMix -Pathoscope.', 'latex': None, 'type': 'table'}}, 'back_matter': [{'text': 'We thank David Balding for the constructive comments on the manuscript, Christian Robert for the helpful methodological discussion, Judith Breuer, Julianne Lockwood and Mike Hubank for data sharing and informative discussions for the interpretation of the results. 
SM is supported by a PhD studentship from', 'cite_spans': [], 'ref_spans': [], 'section': 'Acknowledgments'}]}\n", - "{'paper_id': 'eccef80cfbe078235df22398f195d5db462d8000', 'metadata': {'title': 'Mapping a viral phylogeny onto outbreak trees to improve host transmission inference', 'authors': [{'first': 'Jonathan', 'middle': ['E'], 'last': 'Allen', 'suffix': '', 'affiliation': {'laboratory': '', 'institution': 'Lawrence Livermore National Laboratory', 'location': {'postCode': '94551', 'settlement': 'Livermore', 'region': 'CA', 'country': 'USA'}}, 'email': ''}, {'first': 'Stephan', 'middle': ['P'], 'last': 'Velsko', 'suffix': '', 'affiliation': {'laboratory': '', 'institution': 'Lawrence Livermore National Laboratory', 'location': {'postCode': '94551', 'settlement': 'Livermore', 'region': 'CA', 'country': 'USA'}}, 'email': ''}, {'first': 'Stephan', 'middle': ['P'], 'last': 'Gov;', 'suffix': '', 'affiliation': {'laboratory': '', 'institution': 'Lawrence Livermore National Laboratory', 'location': {'postCode': '94551', 'settlement': 'Livermore', 'region': 'CA', 'country': 'USA'}}, 'email': ''}, {'first': 'Velsko -Velsko2@llnl', 'middle': [], 'last': 'Gov;', 'suffix': '', 'affiliation': {'laboratory': '', 'institution': 'Lawrence Livermore National Laboratory', 'location': {'postCode': '94551', 'settlement': 'Livermore', 'region': 'CA', 'country': 'USA'}}, 'email': ''}]}, 'abstract': [{'text': 'Background: Developing methods to reconstruct transmission histories for viral outbreaks could provide critical information to support locating sources of disease transmission. 
Phylogenetic methods used to measure the degree of relatedness among sequenced viral samples have proven useful in identifying potential outbreak sources.', 'cite_spans': [], 'ref_spans': [], 'section': 'Abstract'}, {'text': 'The complex nature of infectious disease, however, makes it difficult to assign a rigorously defined quantitative confidence value assessing the likelihood of a true direct transmission event using genetic data alone.', 'cite_spans': [], 'ref_spans': [], 'section': 'Abstract'}, {'text': \"Results: A new method is presented to calculate a confidence value assessing the likelihood of a transmission event using both phylogenetic inference and limited knowledge of incubation and infectious duration times. The method is applied to simulations of a foot and mouth disease (FMD) outbreak to demonstrate how the combination of both phylogenetic and epidemiology data can be used to strengthen the assessment of the likelihood of direct transmission over methods using just phylogenetic data or infection timing data alone. The method is applied to a previous FMD outbreak to identify areas where over confidence in previously inferred direct transmission may exist. Conclusion: Combining knowledge from viral evolution and epidemiology within a single integrated transmission inference framework is an important approach to assess the potential likelihood of transmission events and makes clear how specific features of a virus' spread through the course of an outbreak will directly determine the potential for confidence in inferred host transmission links. Background Developing methods to protect against RNA virus outbreaks in the face of constantly changing disease dynamics remains one of the great public health challenges of our time. One small piece of this problem involves determining the host-pathogen transmission history to identify an outbreak's origin. 
In recent cases such as the 2003 SARS outbreak [1, 2], H5N1 influenza [3] infection in humans and Marburgvirus [4], phylogenetic inference has been used to help identify potential outbreak sources. Typically, a collection of sequences from candidate related sources are compared to infer transmission links between the most closely related sequences. These methods have been used beyond the public health domain to provide evidence in HIV [5, 6] and other forensic investigations [7] to help infer when a suspect is involved in infecting a victim. P (δ, σ|M ) rather than P (δ|M ). This reflects the desire to estimate the probability of observing the genetic relation δ in conjunction with infection related timing values for the transmission hypothesis M . The σ parameter values are motivated by a SEIR type infection concept [22], which describes a host in one of four 5 . CC-BY-ND 4.0 International license is made available under a\", 'cite_spans': [], 'ref_spans': [], 'section': 'Abstract'}], 'body_text': [{'text': ': Schematic of transmission links overlaid on phylogenetic trees T 0 (left tree) and T 1 (right tree). Transmission links are shown as directed edges connecting two infected hosts X 0 to X 1 and Y 0 to Y 1 respectively.', 'cite_spans': [], 'ref_spans': [], 'section': ''}, {'text': 'While phylogenetic data helps establish the evolutionary relationships among the sequenced viral samples, there are no well established tools to translate these results to explicitly infer host to host transmission links with confidence values that accurately capture the uncertainty of host transmission inference. 
The result is a sampling and transmission inference process that could lead to false confidence when using genetic data to establish host to host transmission [8, 9] .', 'cite_spans': [{'start': 475, 'end': 478, 'text': '[8,', 'ref_id': 'BIBREF7'}, {'start': 479, 'end': 481, 'text': '9]', 'ref_id': 'BIBREF8'}], 'ref_spans': [], 'section': ''}, {'text': 'Complementary to the viral genome sequencing advances is contact tracing work to find host to host transmission patterns independent of molecular genetic sequence data [10] [11] [12] [13] . Previously a transmission inference framework was described that estimates the probability of viral transmission between two hosts using both viral sequence samples and contact tracing data [14, 15] . The method uses labeled training sets from past outbreaks to correlate models of viral evolution associated with direct transmission and indirect transmission to assign a posterior probability that gives a confidence value for inferring a transmission relationship between two infected hosts. One of the limitations of the approach is the need for a robust training set, which can limit application to novel outbreaks. Even when training data is available, a training set over-represented with local outbreak features could skew confidence values. Thus, there is a need to develop additional tools to mitigate limits introduced from training. One feature that can be difficult to accurately account for with limited exemplars is time series data. The objective of this article is to present an approach to incorporate time series data with molecular genetic sequence data to measure the uncertainty of host to host transmission events in the face of limited training data and variability in host infection times. 
Efforts to integrate models of epidemiology and genetic evolution have been ongoing for some time [16] [17] [18] [19] , yet less work has focused on the specific problem of measuring the certainty of inferring individual host transmission events. One related effort is a method called SeqTrack, which was used to examine the origins of the 2009 H1N1 outbreak [20] . This work is similar to our previous transmission inference [14] approach with a few key differences. Rather than inferring a host transmission graph, a genealogy tree is inferred where nodes represent sequence samples and edges between nodes link a sequence to putative descendants. Sample collection times exclude some candidate edges and edge weights represent genetic distances between two sequences. The genealogy tree is inferred by finding the directed graph analog of the minimum spanning tree. Given the elapsed time between two sequences and a per day mutation rate, a confidence value is assigned for direct genealogy linkages. The framework presented here is a more general framework where variable within host evolution times are assumed to play a role. 
A second closely related effort is the work to recover transmission links between farms applied to the 2001 FMD outbreak [21] .', 'cite_spans': [{'start': 168, 'end': 172, 'text': '[10]', 'ref_id': 'BIBREF9'}, {'start': 173, 'end': 177, 'text': '[11]', 'ref_id': 'BIBREF10'}, {'start': 178, 'end': 182, 'text': '[12]', 'ref_id': 'BIBREF11'}, {'start': 183, 'end': 187, 'text': '[13]', 'ref_id': 'BIBREF12'}, {'start': 380, 'end': 384, 'text': '[14,', 'ref_id': 'BIBREF13'}, {'start': 385, 'end': 388, 'text': '15]', 'ref_id': 'BIBREF14'}, {'start': 1502, 'end': 1506, 'text': '[16]', 'ref_id': 'BIBREF15'}, {'start': 1507, 'end': 1511, 'text': '[17]', 'ref_id': 'BIBREF16'}, {'start': 1512, 'end': 1516, 'text': '[18]', 'ref_id': 'BIBREF17'}, {'start': 1517, 'end': 1521, 'text': '[19]', 'ref_id': 'BIBREF18'}, {'start': 1763, 'end': 1767, 'text': '[20]', 'ref_id': 'BIBREF19'}, {'start': 1830, 'end': 1834, 'text': '[14]', 'ref_id': 'BIBREF13'}, {'start': 2658, 'end': 2662, 'text': '[21]', 'ref_id': 'BIBREF20'}], 'ref_spans': [], 'section': ''}, {'text': \"FMD samples were collected and sequenced and an inferred phylogenetic tree was used in conjunction with farm infection times to evaluate the most likely transmission tree. Data from that study is used here and a comparison with this work will be covered in detail later in the paper. While the Cottam et al. approach evaluates a putative outbreak tree overlaid onto a single phylogenetic tree, our main contribution is to provide a framework that models features of transmission inference not previously considered. In particular, uncertainty in transmission inference is newly captured by explicitly considering unobserved nodes in the outbreak tree and uncertainty of phylogeny through the use of multiple potentially competing phylogenetic inferences. The result is a more realistic measure of uncertainty when assigning a transmission link. 
The framework's added use of competing transmission hypotheses does, however, come with an increased computational cost. The remainder of the paper details the modifications to our original framework, shows how new information is used in simulations and real data, details important differences with the Cottam et al. approach and introduces a way for dealing with limitations introduced by the framework's increased computational cost.\", 'cite_spans': [], 'ref_spans': [], 'section': ''}, {'text': 'The previous framework is briefly reviewed to motivate the notation used to incorporate outbreak timing data. For any two infected hosts A and B, distance δ = δ(A, B) defines a genetic distance between samples taken from the two hosts. In the simplest form one sequenced sample is available for each host and the genetic relationship is described by a Jukes-Cantor like distance measure (δ can take on more complex forms as will be described later). The variable M represents the number of edges (ignoring direction) linking two nodes in an outbreak tree where nodes represent infected entities and edges represent directed transmission events from a source to a recipient. For the direct transmission hypothesis M=1 the posterior probability is defined to be:', 'cite_spans': [], 'ref_spans': [], 'section': 'Algorithm'}, {'text': 'where P (δ|M > 1) = n M =2 P (δ|M )P (M ) for some maximum transmission chain length n. P (δ|M ) is estimated from labeled training data using density estimators or fitting the observed data to a distribution such as Poisson, Negative Binomial or Gamma.', 'cite_spans': [], 'ref_spans': [], 'section': 'Algorithm'}, {'text': 'The modified framework explicitly combines knowledge of viral infection times within the genetic based transmission inference framework. 
This is accomplished by adding a parameter σ and computing the value states 1) Susceptible (S) to infection 2) Exposed (E) to an infection (but not infectious) 3) Infectious (I)able to transmit virus to other hosts and 4) Recovered state (R) -where the host is no longer able to transmit the virus. The SEIR model describes outbreak dynamics in a homogeneous mixing population and since the focus of the framework is to predict individual host to host transmission relationships a pre-determined outbreak tree is used, which excludes hosts in the susceptible state. Figure 2 shows the graphical representation of two infected hosts A and B linked by a direct transmission event. Each circle represents a discrete time point marking the transition between states or time intervals. Transmission time points marked red in Figure 2 denote the time of transmission from Host A to Host B. Sample time points marked in purple represent times when viral samples are collected for sequencing. The representation is analogous to a phylogenetic tree where branches (or tree edges) represent time intervals and nodes represent fixed time points. Here edges or transmission events between infected nodes represent instantaneous time events and the host nodes represent time intervals defined by a linear chain of discrete events. The motivation for this representation is to employ a sufficiently realistic epidemiology model to describe the transmission relationships of interest and use existing methods for modeling viral evolution by mapping them to the epidemiology model.', 'cite_spans': [], 'ref_spans': [{'start': 703, 'end': 711, 'text': 'Figure 2', 'ref_id': 'FIGREF1'}, {'start': 957, 'end': 965, 'text': 'Figure 2', 'ref_id': 'FIGREF1'}], 'section': 'Algorithm'}, {'text': 'The modified framework reports the probability of jointly observing both the inferred genetic relationship between sequences from host A and B and the timing events associated with the viral infections of the two hosts. 
A key phylogenetic inferred parameter becomes the estimated \"wall clock\" time of the inferred most 6 . CC-BY-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/010389 doi: bioRxiv preprint recent common ancestor between the time stamped sequence samples taken from the two hosts and potentially using samples from the larger outbreak. Here the Bayesian estimation software BEAST [23] is used to estimate a series of candidate phylogenetic relationships and inferred times, weighted by a likelihood value for each estimate. Transmission hypothesis testing determines whether the constraints implied by the hypothesis are consistent with the inferred most recent common ancestor times. Next, a description for estimating the probability of direct transmission is given, which is then extended to evaluate any transmission relationship between two nodes in an outbreak tree.', 'cite_spans': [{'start': 723, 'end': 727, 'text': '[23]', 'ref_id': 'BIBREF22'}], 'ref_spans': [], 'section': 'Algorithm'}, {'text': 'Probability distributions describe the expected exposure time and the expected infectious time in the host.', 'cite_spans': [], 'ref_spans': [], 'section': 'Algorithm'}, {'text': 'An additional key requirement is the assignment of a fixed time point to each observed host that is relative to some global time clock. The point in time context can always be satisfied by assuming that the time is noted when a sample is collected from each infected individual. An alternative is to assume a fixed recovery time. For notational simplicity, a fixed recovery time is used although a comparable procedure is available for working from sample times as the reference. 
With host X recovery time R X , infectious time duration inf X occurring with probability P (inf X ) and exposure time exp X occurring with probability ', 'cite_spans': [], 'ref_spans': [], 'section': 'Algorithm'}, {'text': 'and P (E A |exp A , inf A , R A ) = P (exp A ) × P (inf A ) respectively. Now the genetic relation parameter δ is a set of estimates M RCAT i for 1 ≤ i ≤ q where q is a finite integer determined during an independent phylogenetic parameter estimation process. (For example, using Bayesian genetic inference software BEAST, a default value could be q = 10, 000 estimates.) The direct transmission hypothesis is tested by finding the infection time intervals that are consistent with the inferred most recent common ancestor Figure 3 : Example mapping of a two leaf phylogenetic tree onto the host node n (A or B). E n is the initial infection time, I n is initial infectious time, T A is transmission time from host A to host B. S n,1 is the sample collection time (for the first of potentially multiple samples) and R n is the initial recovery time. G − S A,1 and G − S B,1 are genetic sequences for the sampled isolates. 
MRCAT is an inferred time that occurs within the time interval: E A ≤ M RCAT < T A .', 'cite_spans': [], 'ref_spans': [{'start': 523, 'end': 531, 'text': 'Figure 3', 'ref_id': 'FIGREF3'}], 'section': 'Algorithm'}, {'text': 'times and is described as follows:', 'cite_spans': [], 'ref_spans': [], 'section': 'Algorithm'}, {'text': 'and', 'cite_spans': [], 'ref_spans': [], 'section': 'Algorithm'}, {'text': 'where Z is a normalizing constant that reflects the maximum probability value of the individual timing interval events independent of constraints.', 'cite_spans': [], 'ref_spans': [], 'section': 'Algorithm'}, {'text': 'To maintain consistency with previous descriptions of the original framework [14] , the transmission hypothesis M up to now was described by an integer value, which reports the number of edges in a tree separating the two nodes in question. This is sufficient for genetic only transmission inference when genetic distance is computed without regard to transmission direction. Thus, different sub tree configurations containing the same number of edges are treated identically without regard to transmission direction. The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/010389 doi: bioRxiv preprint edges, there are k distinct sub trees that must be evaluated.', 'cite_spans': [{'start': 77, 'end': 81, 'text': '[14]', 'ref_id': 'BIBREF13'}], 'ref_spans': [], 'section': 'Algorithm'}, {'text': 'The color coding of nodes in Figure 4 highlights the two distinct node types in the hypothesis testing framework. The hypothesis test considers an outbreak tree with \"observed\" nodes and \"hidden\" nodes. 
', 'cite_spans': [], 'ref_spans': [{'start': 29, 'end': 37, 'text': 'Figure 4', 'ref_id': 'FIGREF4'}], 'section': 'Algorithm'}, {'text': 'and the calculation of P (σ, δ|t M ) invokes a recursive tree traversal procedure to check the possible transmission timing values that are consistent with the transmission hypothesis t M . Hidden nodes cannot be tied to a global reference time but rather occur within a time range that is relative to the reference time points specified by the observed nodes and satisfy the transmission and phylogenetic constraints. A tree traversal process sets the range of possible infection times using the reference time points for the observed nodes. One of two traversal procedures is invoked depending on the sub tree structure: directed linear chain, where an observed node A is the source and the last node in the transmission chain is the other observed node B (see for example the bottom panel in Figure 4 ). The alternative traversal process is invoked when the two observed nodes are linked by transmission from a common source as shown in Figure 9 . CC-BY-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . 5. Figure 5 shows an example of the tree traversal process for one specific instance of t M where M = 3.', 'cite_spans': [], 'ref_spans': [{'start': 795, 'end': 803, 'text': 'Figure 4', 'ref_id': 'FIGREF4'}, {'start': 940, 'end': 950, 'text': 'Figure 9', 'ref_id': 'FIGREF17'}, {'start': 1113, 'end': 1121, 'text': 'Figure 5', 'ref_id': 'FIGREF7'}], 'section': 'Algorithm'}, {'text': \"The common source sub tree traversal algorithm starts with each observed node (leaf) independently and moves up the tree to the root. Each node n is assigned an earliest infection time E min,n and a latest infection time E max,n . 
The parent of node n is assigned an earliest and latest infection time for each minimum and maximum future initial infection time at node n. The total number of possible initial infection time ranges for the source node is 2 d /2 where d is the length of the chain from the source to the leaf. The set of initial timing event ranges derived from one observable node are compared with the set of infection time ranges derived from the other observable node to determine whether there is a candidate infection time interval that supports the tree's transmission hypothesis linking both observed nodes. \", 'cite_spans': [], 'ref_spans': [], 'section': 'Algorithm'}, {'text': 'A similar procedure is applied for the linear transmission chain case. Here the algorithm starts with the observed source node and propagates candidate infection timing intervals forward to the next recipient node (rather than backwards to the parent node) until the final observable node is reached and the set of possible infection timing intervals is compared with terminal node timing to determine if a candidate 10 . CC-BY-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/010389 doi: bioRxiv preprint timing interval supports the proposed transmission chain hypothesis.', 'cite_spans': [], 'ref_spans': [], 'section': 'Algorithm'}, {'text': 'The key drawback to the method is computational cost. There are 2 d+1 timing assignments that must be ', 'cite_spans': [], 'ref_spans': [], 'section': 'Algorithm'}, {'text': 'To evaluate the potential strength of inference from combining sequence and timing data, a simulation was used to generate genetic sequences from pre-defined outbreak conditions. 
The simulation uses a generative stochastic process for P (σ, δ|t M =1 ) and takes the following parameters as input:', 'cite_spans': [], 'ref_spans': [], 'section': 'Testing'}, {'text': '• A tree of infected hosts and their respective transmission links.', 'cite_spans': [], 'ref_spans': [], 'section': 'Testing'}, {'text': '• Two probability density functions P inc and P inf determining the length of time intervals inc n and inf n respectively for each node n in a outbreak tree.', 'cite_spans': [], 'ref_spans': [], 'section': 'Testing'}, {'text': '• A transmission probability density function P trans determining the probability of transmitting the virus from host A to host B at time t within time interval inf A .', 'cite_spans': [], 'ref_spans': [], 'section': 'Testing'}, {'text': '• A sample probability density function P sample determining when a sequence sample is collected within time interval inf A for host A.', 'cite_spans': [], 'ref_spans': [], 'section': 'Testing'}, {'text': '• A per day mutation rate and an initial source index sequence.', 'cite_spans': [], 'ref_spans': [], 'section': 'Testing'}, {'text': \"A single run simulates an outbreak by propagating sequences across the pre-specified outbreak tree. The index case node A's initial infection time is set to 0 (e.g. E A = 0) and the initial infectious time I A and recovery time R A are determined by values drawn from the P inc and P inf distributions. Secondary infected host B's initial infection time E B is determined by drawing from the P trans distribution to fix the time within the infectious interval from I A to R A that the actual transmission event takes place. The index sequence evolves daily according to the per day mutation rate with distinct sequence stored for each host n for each day from E n to R n . When host A transmits the virus to host B at time t, the sequence in host A at time t becomes the seed sequence for host B. 
A single sequence is used to represent each host node, the chosen sampled sequence is determined by P sample , which gives a discrete time point within the infectious 11 . CC-BY-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/010389 doi: bioRxiv preprint time interval from I n to R n in host node n. Sampled sequences are assumed to be taken following presentation of symptoms in the infected individual. This simulation approach is similar to a related effort to use an epidemic simulation model to construct large scale seasonal outbreaks of measles with viral genome sequence data [24] . Simulations were used to evaluate different viral sampling strategies for their proficiency in reconstructing viral population histories. A key difference with the work presented here is the focus on recovering individual host transmission chains rather than homogeneous host population infection models [25] .\", 'cite_spans': [{'start': 1509, 'end': 1513, 'text': '[24]', 'ref_id': 'BIBREF23'}, {'start': 1820, 'end': 1824, 'text': '[25]', 'ref_id': 'BIBREF24'}], 'ref_spans': [], 'section': 'Testing'}, {'text': 'Using the simulated sampled sequences from host nodes, phylogenetic trees were generated with BEAST running 1,000,000 iterations with 100,000 initial runs discarded and evaluating 10,000 sampled trees. The HKY nucleotide substitution model was used with fixed variation, and a relaxed molecular clock [26] with exponential distribution and a fixed population size coalescent model. The genetic distance between two sequences was taken to be the sum of the absolute time difference between two leaves in the tree and the most recent common ancestor. The final distance value is the weighted average over all available inferred trees. The same set of most recent common ancestor times was used as input to the method for integrating time with genetic data. 
For the test results reported below, the probability distributions for the incubation time and infectious duration time were taken to be uniform across a pre-determined range described below.', 'cite_spans': [{'start': 301, 'end': 305, 'text': '[26]', 'ref_id': 'BIBREF25'}], 'ref_spans': [], 'section': 'Testing'}, {'text': 'The simulator reflects reported features from the 2001 UK FMD outbreak described in [21] . Infected hosts here do not reflect individual animals but rather infected farms. An average incubation time of 5 days is assumed with a variation between 2 and 14 days determined by a gamma distribution. The infectious time interval varied from 1 to 10 days drawn from a distribution model on previously described time intervals given by [21] . The objective was to evaluate the impact of combining the genetic data with timing information and apply the evaluation methods to the original 2001 UK FMD outbreak data to assess the confidence of the associated transmission links predicted using similar evidence but using the new inference approach. A per day mutation rate of 2.226 × 10 −5 is used following a previously published report [10] . Figure 6 shows the tree used for the simulation. The tree is based on a subset of the 2001 outbreak given in [21] . The nodes labeled with two character codes were added to the published tree to connect nodes thought to be linked indirectly such that the total time duration distribution generated from repeated The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/010389 doi: bioRxiv preprint The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/010389 doi: bioRxiv preprint to the most probable collection of incubation times independent of the hypothesis. 
Here just a single set of incubation and infectious duration times are considered (the true time) and thus every pair of nodes with a timing pattern consistent with the predicted transmission event occurs with probability 1. Thus, the false positive rate reflects the fact that roughly 3% of node pairs that are not linked have a timing pattern consistent with the potential for direct transmission. These pairs likely refer to \"siblings\" or other nodes that are close by in the tree. This provides a baseline measure for quantifying the difficulty of the inference problem and shows that when exact timing knowledge is available the possible transmission inferences become highly constrained. The MRCAT-Only line shows the other end of the spectrum where there are no explicit timing constraints to prevent inference of direct transmission. The plot shows that a low confidence threshold is required to correctly identify all true direct transmission links, which shows that there is a wide overlap between the genetic distance range for both direct and indirectly linked hosts.', 'cite_spans': [{'start': 84, 'end': 88, 'text': '[21]', 'ref_id': 'BIBREF20'}, {'start': 429, 'end': 433, 'text': '[21]', 'ref_id': 'BIBREF20'}, {'start': 828, 'end': 832, 'text': '[10]', 'ref_id': 'BIBREF9'}, {'start': 944, 'end': 948, 'text': '[21]', 'ref_id': 'BIBREF20'}], 'ref_spans': [{'start': 835, 'end': 843, 'text': 'Figure 6', 'ref_id': 'FIGREF11'}], 'section': 'Testing'}, {'text': 'Combining the two approaches shows that the already low false positive rate from the highly constrained timing data can be reduced still further by adding the constraints introduced by the timing data inferred from the time stamped genetic sequence data.', 'cite_spans': [], 'ref_spans': [], 'section': 'Testing'}, {'text': '. CC-BY-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . 
https://doi.org/10.1101/010389 doi: bioRxiv preprint Figure 7 illustrates how the algorithm works but assumes unrealistically precise knowledge of incubation and infectious duration times. Figure 8 shows the same comparison, but reducing the knowledge of incubation and infectious times. There are two distinct parameter settings shown in the Figure 8 . One setting where every observed node is assumed to have an incubation time that could range from 2 to 9 days with equal probability and the estimated infectious duration for each node is based on the known window but assigned and uncertainty of plus or minus one day (Combined:2-9 in Figure 8 ). The second setting is identical except the incubation time is assumed to extend from 2 to 14 days (Combined:2-14 in Figure 8 ).', 'cite_spans': [], 'ref_spans': [{'start': 212, 'end': 220, 'text': 'Figure 7', 'ref_id': 'FIGREF12'}, {'start': 348, 'end': 356, 'text': 'Figure 8', 'ref_id': 'FIGREF14'}, {'start': 502, 'end': 510, 'text': 'Figure 8', 'ref_id': 'FIGREF14'}, {'start': 798, 'end': 806, 'text': 'Figure 8', 'ref_id': 'FIGREF14'}, {'start': 926, 'end': 934, 'text': 'Figure 8', 'ref_id': 'FIGREF14'}], 'section': '15'}, {'text': 'The Timing-Only plot uses the 2 to 9 day incubation window and shows that (without using the genetic data) the range of possible timing scenarios more than triples the number of node pairs that are falsely linked by direct transmission from 3% to 10%. Combining the data sources shows improved inference with The key here seems to be the uncertainty associated with assigning a time to the most recent common ancestor. Longer incubation times open more plausible phylogenetic scenarios where two hosts are falsely linked by direct transmission but are actually linked by a shared infectious source. 
One potential way to strengthen inference could be to collect more sequences per farm spread over multiple time points, which in some situations may reduce the range of most likely most recent common ancestor times. The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/010389 doi: bioRxiv preprint trees. The algorithm starts at the leaves of the parsimony tree and assigns the inferred common ancestor nodes to one of the farms (with a descendant leaf) as the putative source. All possible tree labellings are evaluated using an infection time likelihood score, which similar to our approach estimates likely infection times based on estimated incubation and infectious duration times. Several important differences between the two approaches emerge after examining the differences in output. One difference is that our method incorporates putative unobserved farms into transmission hypothesis tests, which help prevent over confidence in certain transmission inferences. As will be discussed in the next section, this can also form the basis for supporting a more flexible framework for evaluating more sparsely sampled outbreaks. A second difference is that multiple phylogenetic tree topologies are used in the form of multiple most recent The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/010389 doi: bioRxiv preprint Figure 10 : Likelihood for different transmission hypothesis for C to P and O to P, with different transmission chain lengths M. tree nodes to phylogenetic tree nodes whereas our method assigns phylogenetic tree nodes to outbreak tree nodes, which represent variable length time. 
The significance of these differences is that the new framework can test many additional plausible transmission hypotheses, which would otherwise be ignored.', 'cite_spans': [], 'ref_spans': [{'start': 2060, 'end': 2069, 'text': 'Figure 10', 'ref_id': 'FIGREF0'}], 'section': '15'}, {'text': 'Despite the differences, there was largely strong consensus for predictions between the two methods with all of the farms linked with high confidence in [21] predicted in the new method except the K to B link (results not shown). The contiguous edges marked in black show transmission inferences that match [21] .', 'cite_spans': [{'start': 153, 'end': 157, 'text': '[21]', 'ref_id': 'BIBREF20'}, {'start': 307, 'end': 311, 'text': '[21]', 'ref_id': 'BIBREF20'}], 'ref_spans': [], 'section': '15'}, {'text': 'Linkage between K and B required increasing the infectious window beyond the initial reported estimate.', 'cite_spans': [], 'ref_spans': [], 'section': '15'}, {'text': 'To make a complete comparison the discrepancy is noted and the infectious window was increased to 4 days to accommodate the possibility of a transmission link from K to B. The two dashed edges show links made by the new inference tool not present in the most probable transmission tree reported in [21] . The two red edges show the direct transmission inferences in the most likely Cottam outbreak tree that are not present in the most probable tree reported here. Next the key differences in the two approaches are discussed with respect to differences in observed transmission inference. The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/010389 doi: bioRxiv preprint Figure 10 shows the likelihood scores for each of the individual transmission hypothesis linking C to P and O to P. The x-axis is labeled by the number of intermediate host nodes (+ 1). Adjacent unlabeled values have the same M value but a different sub tree structure. 
The figure shows that the direct hypothesis scores are nearly identical but the alternative hypothesis for M=2 shows that an indirect link for O to P is more plausible. Thus, the explicit scoring of alternative transmission hypotheses involving hypothetical infected farms highlight potential transmission inference ambiguity. In the case of the Cottam tree, the potential for an O to P link is not discussed, however, it appears that this possibility may be precluded due to the structure of the parsimony tree.', 'cite_spans': [{'start': 298, 'end': 302, 'text': '[21]', 'ref_id': 'BIBREF20'}], 'ref_spans': [{'start': 739, 'end': 748, 'text': 'Figure 10', 'ref_id': 'FIGREF0'}], 'section': '15'}, {'text': '• Use of multiple most recent common ancestor time estimates in addition to transmission time is included in the transmission likelihood calculation.', 'cite_spans': [], 'ref_spans': [], 'section': '15'}, {'text': 'For farm O, the current method more clearly favors B as the source over K. Figure 11 The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/010389 doi: bioRxiv preprint Figure 12 : Portion of the maximum credibility tree. Phylogenetic tree leaves represent sequence taken from four farms: O, C, D, M and P. The putative transmission chain O to C to P to M is overlaid onto the phylogenetic tree, with each farm assigned to an ancestral node.', 'cite_spans': [], 'ref_spans': [{'start': 75, 'end': 84, 'text': 'Figure 11', 'ref_id': 'FIGREF0'}, {'start': 234, 'end': 243, 'text': 'Figure 12', 'ref_id': 'FIGREF0'}], 'section': '15'}, {'text': 'The red area exceeds the blue area indicating that the inferred MRCA times between B and O are more frequently consistent with the B to O direct transmission hypothesis than the K to O alternative. 
Similar to the previous case, it appears that the inferred tree used in Cottam may preclude the possibility of a K to B to O transmission path.', 'cite_spans': [], 'ref_spans': [], 'section': '15'}, {'text': '• Farm transmission is modeled to occur instantaneously (with respect to time) and the variable length evolution occurs within the host.', 'cite_spans': [], 'ref_spans': [], 'section': '15'}, {'text': 'The maximum credibility tree shown in Figure 12 indicates the limitation of using the inferred ancestral nodes as the basis for reconstruction of the outbreak tree. The transmission chain O to C to P to M includes a 0 length branch from C to P that suggests a transmission from farm C to farm P with no genetic variation in the virus, which would be rejected as unlikely given minimum incubation times and the expected mutation rate. In the new method, however, since farm infections are represented by time intervals, the initial infection time of farm C can be shown to occur higher up the tree between two phylogenetic tree nodes, which would imply some mutation to occur on farm C prior to transmission to farm P. Thus, mapping the inferred phylogeny onto the outbreak tree rather than assigning outbreak nodes to phylogenetic tree nodes allows for the consideration of transmission hypotheses that would otherwise be excluded.', 'cite_spans': [], 'ref_spans': [{'start': 38, 'end': 47, 'text': 'Figure 12', 'ref_id': 'FIGREF0'}], 'section': '15'}, {'text': 'A strength of the Cottam et al. approach is that the transmission tree is built using all the sequences collectively by iteratively labeling the internal tree nodes rather than treating pairs of sequences independently, thus some transmission hypotheses conceivably could be more readily rejected by looking at multiple farms simultaneously. 
The new method shows the use of multiple tree topologies in a way that is similar to other recent applications [27] and demonstrates a natural way to incorporate multiple tree topologies in a single sequence comparison procedure. One potential improvement to the Cottam et al.', 'cite_spans': [{'start': 453, 'end': 457, 'text': '[27]', 'ref_id': 'BIBREF26'}], 'ref_spans': [], 'section': '15'}, {'text': 'approach would be to include additional candidate tree topologies for evaluation and report likelihood 20 . CC-BY-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/010389 doi: bioRxiv preprint values that are averaged across multiple trees.', 'cite_spans': [], 'ref_spans': [], 'section': '15'}, {'text': 'While it may be possible to densely sample from an outbreak in select circumstances, in many cases only sparse sampling is possible and this is likely to be another limitation of the published Cottam et al.', 'cite_spans': [], 'ref_spans': [], 'section': 'Establishing transmission links in a sparsely sampled outbreak'}, {'text': 'approach, which used dense farm sampling. Integrating phylogenetic data with infection timing data to infer more distant transmission linkages, however, is expected to be considerably more difficult. Intuitively, distinguishing between a transmission chain of length 20 from a chain of length 25 is not likely given the range of possible infection times. This is not a limitation of the method but a limitation of exploitable transmission inference knowledge. 
Nevertheless, a potential contribution of the method presented here is that if it is possible to relax the requirements of an exhaustive search of infection times, the framework could be used to more precisely quantify the uncertainty associated with making inherently more difficult transmission inferences and thus shed light on the conditions where more distant transmission links can be made.', 'cite_spans': [], 'ref_spans': [], 'section': 'Establishing transmission links in a sparsely sampled outbreak'}, {'text': 'To investigate this problem we introduce two random sampling steps to the algorithm, which when not implemented would otherwise preclude consideration of longer transmission chains. The first sampling step is to draw k different timing samples from the underlying incubation and infectious time duration distributions P inc and P inf respectively. For the results described here k was set to 50. The second random sampling step is in the tree traversal procedure illustrated in Figure 5 , which is designed to determine whether there is a sequence of timing events for a particular transmission chain and a given collection of infection time durations that is consistent with the inferred most recent common ancestor time. Rather than an exhaustive search, a fixed number of random trials are taken. This approach is thus biased against inferring links requiring rarely occurring timing intervals that would be required to support the most recent common ancestor time. 
We note that additional sampling strategies could be implemented to make this approach more robust, however, the current modifications should be sufficient for evaluating the potential to apply this approach to larger sparsely sampled outbreaks.', 'cite_spans': [], 'ref_spans': [{'start': 478, 'end': 486, 'text': 'Figure 5', 'ref_id': 'FIGREF7'}], 'section': 'Establishing transmission links in a sparsely sampled outbreak'}, {'text': 'For evaluation a variation on the previously described FMD outbreak simulation is used. Rather than using the smaller fixed tree topology, a tree topology was generated at random starting with an index node and randomly generating child nodes using a branching factor of 1.5, a maximal outbreak duration of 110 days and a maximal node size of 2,000. 100 trees were generated and the 40 trials that generated trees with 2,000 nodes were selected for further analysis. For each tree a pair of nodes was chosen at random to 21 . CC-BY-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/010389 doi: bioRxiv preprint Figure 13 : An example maximum credibility tree for a sparsely sampled simulated FMD outbreak. Tree leaves are labeled with numbers corresponding to nodes taken from a 2,000 node simulated outbreak tree. Select node pairs with known transmission relationships were evaluated. The selected node pairs are labeled by their transmission links using blue edges to link pairs and the path length marked in blue. The two sub trees are connected by a common root (not shown for image compactness).', 'cite_spans': [], 'ref_spans': [{'start': 736, 'end': 745, 'text': 'Figure 13', 'ref_id': 'FIGREF0'}], 'section': 'Establishing transmission links in a sparsely sampled outbreak'}, {'text': 'represent each path length present in the tree starting from 1 up to the maximal observed path length. 
To limit computational costs, only odd path lengths were evaluated. The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/010389 doi: bioRxiv preprint Figure 16 show P (δ|M ) values averaged over the 40 random trials. For clarity the two figures are used to separate the distributions into two classes, those with peaks at M ≤5 and those that peak at M > 5. (Note that the maximal path length observed was 37, however, due to the 2,000 tree node limit, longer path lengths occur less frequently in the 40 trial experiment and thus path lengths with less than 40 observations are not shown.) The results show that the framework assigns a likelihood score that roughly reflects the true underlying path length. In particular, all path lengths less than 10 clearly peak at M=1 or M=5 in the figure. The results do suggest room for improvement, however, since path lengths for M=11-17 still show their maximum likelihood value at M=5. A possible reason for this is that for longer transmission chains the space of inconsistent infection times may expand at a rate that is not proportional to the increase in the number consistent infection times. Since the method uses a fixed random sample size this would favor shorter more constrained transmission hypotheses. Nevertheless, even where the likelihood value peaks at M=5, for longer transmission chains the likelihood values for M > 5 come increasingly closer to the peak until there is finally a peak shift at path length 19 from M=5 to M=10. It is interesting to note that as the path length increases, the peak likelihood value decreases quantifying the increasing ambiguity associated with making long transmission chain inferences. Despite this difficulty, the results indicate that in many cases it may be possible to assign a transmission relationship using a more inclusive class of transmission chain lengths. 
For example, nodes linked by a transmission path length less than 10 appear to be fairly distinct from those separated by path lengths greater than 20. These distinctions could prove to be of critical importance, particularly when evaluated in the context of an underlying contact network.', 'cite_spans': [], 'ref_spans': [{'start': 320, 'end': 329, 'text': 'Figure 16', 'ref_id': 'FIGREF0'}], 'section': 'Establishing transmission links in a sparsely sampled outbreak'}, {'text': \"It is important to note that while the presented method is designed to use prior knowledge of viral infection timing patterns, uniform probability distributions were used since an accurate estimate of the distribution is not likely in the general case. More generally since the method requires both viral evolution and epidemiology knowledge there are significant outstanding challenges not addressed in this report. The method makes the simplifying assumption of a neutral evolution model without recombination. This assumption may be appropriate for certain acute viral infection types where host immunity does not play a significant role, however, chronic infections with wider persistent circulation of multiple subtypes increase the chances to misinterpret the transmission process. Additional work is need to ensure that any violation 25 . CC-BY-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/010389 doi: bioRxiv preprint of assumptions of evolution are detected and (ideally) corrected. The model currently lacks inter-species evolution modeling and as sequencing technology advances there will be opportunities to better measure within host viral populations [28] and the affects on the genetic features of the recipient host's initial seeding population. A consequence of this limited knowledge is reliance on a potentially simplifying feature of viral population structure. 
When testing direct transmission hypotheses the most recent common ancestor between sequences sampled from the source and recipient are assumed to reside in the source host.\", 'cite_spans': [{'start': 1295, 'end': 1299, 'text': '[28]', 'ref_id': 'BIBREF27'}], 'ref_spans': [], 'section': 'Discussion'}, {'text': 'However, with a sufficiently large and genetically diverse host population it is at least conceivable that the most recent common ancestor resides in an earlier predecessor node.', 'cite_spans': [], 'ref_spans': [], 'section': 'Discussion'}, {'text': 'With respect to epidemiology modeling, the paper assumes there are accurate values for P (M ), which describe the prior knowledge about the relative frequency of the different types of transmission chains.', 'cite_spans': [], 'ref_spans': [], 'section': 'Discussion'}, {'text': 'There has been considerable progress in this area in recent years linking contact networks with the more traditional stochastic compartmentalized transmission models to yield estimates [11, 29] that can be inferred using knowledge gleaned from previous outbreaks. Moreover, there are other potentially useful epidemiological models that could be substituted for the SEIR approach presented here [30] . Another important area of development is the use of sequence data and a coalescent model to estimate epidemiological parameters. Recent work demonstrates the potential to estimate parameters for a SIR model from genetic sequence data [31] , which could provide an important basis for specifying the epidemiological parameters for transmission inference, when limited non-genome sequence derived observation is available. A strength of the approach is to provide a framework that can readily make use of increasing amounts of knowledge for a particular outbreak. 
For example, awareness of infections may increase during the course of an outbreak and thus favor using two different priors for the infectious duration times, one at an early outbreak stage and another at a later stage of the outbreak. Other clinical features of infection can be taken into account either by changing prior likelihoods for alternatives in the underlying outbreak tree structure or by making adjustment to priors in the timing distributions. In some cases prior knowledge could be used to narrow the window of possible infection times and greatly strengthen the confidence of transmission inference. In other cases prior knowledge may end up increasing the window of possible infection times and thus highlight the inherent complexity of reconstructing transmission events.', 'cite_spans': [{'start': 185, 'end': 189, 'text': '[11,', 'ref_id': 'BIBREF10'}, {'start': 190, 'end': 193, 'text': '29]', 'ref_id': 'BIBREF28'}, {'start': 395, 'end': 399, 'text': '[30]', 'ref_id': 'BIBREF29'}, {'start': 636, 'end': 640, 'text': '[31]', 'ref_id': None}], 'ref_spans': [], 'section': 'Discussion'}, {'text': '. CC-BY-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/010389 doi: bioRxiv preprint The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . 
https://doi.org/10.1101/010389 doi: bioRxiv preprint', 'cite_spans': [], 'ref_spans': [], 'section': '26'}], 'bib_entries': {'BIBREF0': {'ref_id': 'b0', 'title': 'Molecular Evolution of the SARS Coronavirus During the Course of the SARS Epidemic in China', 'authors': [{'first': 'Tcsme', 'middle': [], 'last': 'Consortium', 'suffix': ''}], 'year': 2004, 'venue': 'Science', 'volume': '303', 'issn': '5664', 'pages': '1666--1669', 'other_ids': {}}, 'BIBREF1': {'ref_id': 'b1', 'title': 'Bats Are Natural Reservoirs of SARS-Like Coronaviruses', 'authors': [{'first': 'W', 'middle': [], 'last': 'Li', 'suffix': ''}, {'first': 'Z', 'middle': [], 'last': 'Shi', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Yu', 'suffix': ''}, {'first': 'W', 'middle': [], 'last': 'Ren', 'suffix': ''}, {'first': 'C', 'middle': [], 'last': 'Smith', 'suffix': ''}, {'first': 'J', 'middle': ['H'], 'last': 'Epstein', 'suffix': ''}, {'first': 'H', 'middle': [], 'last': 'Wang', 'suffix': ''}, {'first': 'G', 'middle': [], 'last': 'Crameri', 'suffix': ''}, {'first': 'Z', 'middle': [], 'last': 'Hu', 'suffix': ''}, {'first': 'H', 'middle': [], 'last': 'Zhang', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Zhang', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Mceachern', 'suffix': ''}, {'first': 'H', 'middle': [], 'last': 'Field', 'suffix': ''}, {'first': 'P', 'middle': [], 'last': 'Daszak', 'suffix': ''}, {'first': 'B', 'middle': ['T'], 'last': 'Eaton', 'suffix': ''}, {'first': 'S', 'middle': [], 'last': 'Zhang', 'suffix': ''}, {'first': 'L', 'middle': ['F'], 'last': 'Wang', 'suffix': ''}], 'year': 2005, 'venue': 'Science', 'volume': '310', 'issn': '5748', 'pages': '676--679', 'other_ids': {}}, 'BIBREF2': {'ref_id': 'b2', 'title': 'Characterization of an Avian Influenza A (H5N1) Virus Isolated from a Child with a Fatal Respiratory Illness', 'authors': [{'first': 'K', 'middle': [], 'last': 'Subbarao', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Klimov', 'suffix': ''}, {'first': 
'J', 'middle': [], 'last': 'Katz', 'suffix': ''}, {'first': 'H', 'middle': [], 'last': 'Regnery', 'suffix': ''}, {'first': 'W', 'middle': [], 'last': 'Lim', 'suffix': ''}, {'first': 'H', 'middle': [], 'last': 'Hall', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Perdue', 'suffix': ''}, {'first': 'D', 'middle': [], 'last': 'Swayne', 'suffix': ''}, {'first': 'C', 'middle': [], 'last': 'Bender', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Huang', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Hemphill', 'suffix': ''}, {'first': 'T', 'middle': [], 'last': 'Rowe', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Shaw', 'suffix': ''}, {'first': 'X', 'middle': [], 'last': 'Xu', 'suffix': ''}, {'first': 'K', 'middle': [], 'last': 'Fukuda', 'suffix': ''}, {'first': 'N', 'middle': [], 'last': 'Cox', 'suffix': ''}], 'year': 1998, 'venue': 'Science', 'volume': '279', 'issn': '5349', 'pages': '393--396', 'other_ids': {}}, 'BIBREF3': {'ref_id': 'b3', 'title': 'Marburgvirus Genomics and Association with a Large Hemorrhagic Fever Outbreak in Angola', 'authors': [{'first': 'J', 'middle': ['S'], 'last': 'Towner', 'suffix': ''}, {'first': 'M', 'middle': ['L'], 'last': 'Khristova', 'suffix': ''}, {'first': 'T', 'middle': ['K'], 'last': 'Sealy', 'suffix': ''}, {'first': 'M', 'middle': ['J'], 'last': 'Vincent', 'suffix': ''}, {'first': 'B', 'middle': ['R'], 'last': 'Erickson', 'suffix': ''}, {'first': 'D', 'middle': ['A'], 'last': 'Bawiec', 'suffix': ''}, {'first': 'A', 'middle': ['L'], 'last': 'Hartman', 'suffix': ''}, {'first': 'J', 'middle': ['A'], 'last': 'Comer', 'suffix': ''}, {'first': 'S', 'middle': ['R'], 'last': 'Zaki', 'suffix': ''}, {'first': 'U', 'middle': [], 'last': 'Stroher', 'suffix': ''}, {'first': 'F', 'middle': [], 'last': 'Gomes Da Silva', 'suffix': ''}, {'first': 'F', 'middle': [], 'last': 'Castillo', 'suffix': ''}, {'first': 'P', 'middle': ['E'], 'last': 'Rollin', 'suffix': ''}, {'first': 'T', 'middle': ['G'], 'last': 'Ksiazek', 'suffix': 
''}, {'first': 'Nichol', 'middle': [], 'last': 'St', 'suffix': ''}], 'year': 2006, 'venue': 'J. Virol', 'volume': '80', 'issn': '13', 'pages': '6497--6516', 'other_ids': {}}, 'BIBREF4': {'ref_id': 'b4', 'title': 'Molecular Epidemiology of HIV Transmission in a Dental Practice', 'authors': [{'first': 'C', 'middle': ['Y'], 'last': 'Ou', 'suffix': ''}, {'first': 'C', 'middle': ['A'], 'last': 'Ciesielski', 'suffix': ''}, {'first': 'G', 'middle': [], 'last': 'Myers', 'suffix': ''}, {'first': 'C', 'middle': ['I'], 'last': 'Bandea', 'suffix': ''}, {'first': 'C', 'middle': ['C'], 'last': 'Luo', 'suffix': ''}, {'first': 'Btm', 'middle': [], 'last': 'Korber', 'suffix': ''}, {'first': 'J', 'middle': ['I'], 'last': 'Mullins', 'suffix': ''}, {'first': 'G', 'middle': [], 'last': 'Schochetman', 'suffix': ''}, {'first': 'R', 'middle': ['L'], 'last': 'Berkelman', 'suffix': ''}, {'first': 'A', 'middle': ['N'], 'last': 'Economou', 'suffix': ''}, {'first': 'J', 'middle': ['J'], 'last': 'Witte', 'suffix': ''}, {'first': 'L', 'middle': ['J'], 'last': 'Furman', 'suffix': ''}, {'first': 'G', 'middle': ['A'], 'last': 'Satten', 'suffix': ''}, {'first': 'K', 'middle': ['A'], 'last': 'Maclnnes', 'suffix': ''}, {'first': 'J', 'middle': ['W'], 'last': 'Curran', 'suffix': ''}, {'first': 'H', 'middle': ['W'], 'last': 'Jaffe', 'suffix': ''}, {'first': 'L', 'middle': ['I'], 'last': 'Group', 'suffix': ''}, {'first': 'E', 'middle': ['I'], 'last': 'Group', 'suffix': ''}], 'year': 1992, 'venue': 'Science', 'volume': '256', 'issn': '5060', 'pages': '1165--1171', 'other_ids': {}}, 'BIBREF5': {'ref_id': 'b5', 'title': 'Molecular evidence of HIV-1 transmission in a criminal case', 'authors': [{'first': 'M', 'middle': ['L'], 'last': 'Metzker', 'suffix': ''}, {'first': 'D', 'middle': ['P'], 'last': 'Mindell', 'suffix': ''}, {'first': 'X', 'middle': ['M'], 'last': 'Liu', 'suffix': ''}, {'first': 'R', 'middle': ['G'], 'last': 'Ptak', 'suffix': ''}, {'first': 'R', 'middle': ['A'], 'last': 'Gibbs', 'suffix': 
''}, {'first': 'D', 'middle': ['M'], 'last': 'Hillis', 'suffix': ''}], 'year': 2002, 'venue': 'Proceedings of the National Academy of Sciences of the United States of America', 'volume': '99', 'issn': '', 'pages': '14292--14297', 'other_ids': {}}, 'BIBREF6': {'ref_id': 'b6', 'title': 'The first case of criminalization of transmission of hepatitis B in the UK: defendant sentenced to two years imprisonment on the grounds of hepatitis B deoxyribonucleic acid sequencing', 'authors': [{'first': 'K', 'middle': [], 'last': 'Mohanty', 'suffix': ''}], 'year': 2009, 'venue': 'Int J STD AIDS', 'volume': '20', 'issn': '8', 'pages': '587--589', 'other_ids': {}}, 'BIBREF7': {'ref_id': 'b7', 'title': 'The microbial forensic use of HIV sequences', 'authors': [{'first': 'G', 'middle': ['H'], 'last': 'Learn', 'suffix': ''}, {'first': 'J', 'middle': ['I'], 'last': 'Mullins', 'suffix': ''}], 'year': 2003, 'venue': 'HIV Sequence Compendium', 'volume': '', 'issn': '', 'pages': '22--37', 'other_ids': {}}, 'BIBREF8': {'ref_id': 'b8', 'title': 'Reconstruction of HIV-1 transmission chains for forensic purposes', 'authors': [{'first': 'T', 'middle': [], 'last': 'Leitner', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Albert', 'suffix': ''}], 'year': 2000, 'venue': 'AIDS Rev', 'volume': '2', 'issn': '', 'pages': '241--251', 'other_ids': {}}, 'BIBREF9': {'ref_id': 'b9', 'title': 'Transmission Dynamics of the Etiological Agent of SARS in Hong Kong: Impact of Public Health Interventions', 'authors': [{'first': 'S', 'middle': [], 'last': 'Riley', 'suffix': ''}, {'first': 'C', 'middle': [], 'last': 'Fraser', 'suffix': ''}, {'first': 'C', 'middle': ['A'], 'last': 'Donnelly', 'suffix': ''}, {'first': 'A', 'middle': ['C'], 'last': 'Ghani', 'suffix': ''}, {'first': 'L', 'middle': ['J'], 'last': 'Abu-Raddad', 'suffix': ''}, {'first': 'A', 'middle': ['J'], 'last': 'Hedley', 'suffix': ''}, {'first': 'G', 'middle': ['M'], 'last': 'Leung', 'suffix': ''}, {'first': 'L', 'middle': ['M'], 'last': 'Ho', 
'suffix': ''}, {'first': 'T', 'middle': ['H'], 'last': 'Lam', 'suffix': ''}, {'first': 'T', 'middle': ['Q'], 'last': 'Thach', 'suffix': ''}, {'first': 'P', 'middle': [], 'last': 'Chau', 'suffix': ''}, {'first': 'K', 'middle': ['P'], 'last': 'Chan', 'suffix': ''}, {'first': 'S', 'middle': ['V'], 'last': 'Lo', 'suffix': ''}, {'first': 'P', 'middle': ['Y'], 'last': 'Leung', 'suffix': ''}, {'first': 'T', 'middle': [], 'last': 'Tsang', 'suffix': ''}, {'first': 'W', 'middle': [], 'last': 'Ho', 'suffix': ''}, {'first': 'K', 'middle': ['H'], 'last': 'Lee', 'suffix': ''}, {'first': 'Emc', 'middle': [], 'last': 'Lau', 'suffix': ''}, {'first': 'N', 'middle': ['M'], 'last': 'Ferguson', 'suffix': ''}, {'first': 'R', 'middle': ['M'], 'last': 'Anderson', 'suffix': ''}], 'year': 2003, 'venue': 'Science', 'volume': '300', 'issn': '', 'pages': '1961--1966', 'other_ids': {}}, 'BIBREF10': {'ref_id': 'b10', 'title': 'Disease contact tracing in random and clustered networks', 'authors': [{'first': 'I', 'middle': ['Z'], 'last': 'Kiss', 'suffix': ''}, {'first': 'D', 'middle': ['M'], 'last': 'Green', 'suffix': ''}, {'first': 'R', 'middle': ['R'], 'last': 'Kao', 'suffix': ''}], 'year': 2005, 'venue': 'Proc R Soc B', 'volume': '272', 'issn': '', 'pages': '1407--1414', 'other_ids': {}}, 'BIBREF11': {'ref_id': 'b11', 'title': 'Epidemiology and Control of SARS in Singapore', 'authors': [{'first': 'K', 'middle': ['T'], 'last': 'Goh', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Cutter', 'suffix': ''}, {'first': 'B', 'middle': ['H'], 'last': 'Heng', 'suffix': ''}, {'first': 'S', 'middle': [], 'last': 'Ma', 'suffix': ''}, {'first': 'B', 'middle': ['K'], 'last': 'Koh', 'suffix': ''}, {'first': 'C', 'middle': [], 'last': 'Kwok', 'suffix': ''}, {'first': 'C', 'middle': ['M'], 'last': 'Toh', 'suffix': ''}, {'first': 'S', 'middle': ['K'], 'last': 'Chew', 'suffix': ''}], 'year': 2006, 'venue': 'Ann Acad Med', 'volume': '35', 'issn': '', 'pages': '301--316', 'other_ids': {}}, 'BIBREF12': 
{'ref_id': 'b12', 'title': 'When individual behaviour matters: homogenous and network models in epidemiology', 'authors': [{'first': 'S', 'middle': [], 'last': 'Bansal', 'suffix': ''}, {'first': 'B', 'middle': ['T'], 'last': 'Grenfell', 'suffix': ''}, {'first': 'L', 'middle': ['A'], 'last': 'Meyers', 'suffix': ''}], 'year': 2007, 'venue': 'J R Soc Interface', 'volume': '4', 'issn': '', 'pages': '879--891', 'other_ids': {}}, 'BIBREF13': {'ref_id': 'b13', 'title': 'A Statistical Framework for Microbial Source Attribution Part 1: Forensic inferences on disease transmission networks', 'authors': [{'first': 'S', 'middle': ['P'], 'last': 'Velsko', 'suffix': ''}, {'first': 'J', 'middle': ['E'], 'last': 'Allen', 'suffix': ''}, {'first': 'C', 'middle': [], 'last': 'Cunningham', 'suffix': ''}], 'year': 2009, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF14': {'ref_id': 'b14', 'title': 'A Statistical Framework for Microbial Source Attribution: Measuring uncertainty in host transmission events inferred from genetic data', 'authors': [{'first': 'J', 'middle': ['E'], 'last': 'Allen', 'suffix': ''}, {'first': 'S', 'middle': ['P'], 'last': 'Velsko', 'suffix': ''}], 'year': 2009, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF15': {'ref_id': 'b15', 'title': 'Unifying the Epidemiological and Evolutionary Dynamics of Pathogens', 'authors': [{'first': 'B', 'middle': ['T'], 'last': 'Grenfell', 'suffix': ''}, {'first': 'O', 'middle': ['G'], 'last': 'Pybus', 'suffix': ''}, {'first': 'J', 'middle': ['R'], 'last': 'Gog', 'suffix': ''}, {'first': 'Jln', 'middle': [], 'last': 'Wood', 'suffix': ''}, {'first': 'J', 'middle': ['M'], 'last': 'Daly', 'suffix': ''}, {'first': 'J', 'middle': ['A'], 'last': 'Mumford', 'suffix': ''}, {'first': 'E', 'middle': ['C'], 'last': 'Holmes', 'suffix': ''}], 'year': 2004, 'venue': 'Science', 'volume': '303', 'issn': '', 'pages': '327--332', 'other_ids': {}}, 'BIBREF16': {'ref_id': 'b16', 'title': 
'Infecting epidemiology with genetics: a new frontier in disease ecology', 'authors': [{'first': 'E', 'middle': ['A'], 'last': 'Archie', 'suffix': ''}, {'first': 'G', 'middle': [], 'last': 'Luikart', 'suffix': ''}, {'first': 'Ezenwa', 'middle': [], 'last': 'Vo', 'suffix': ''}], 'year': 2008, 'venue': 'Trends in Ecology an dEvolution', 'volume': '24', 'issn': '', 'pages': '21--30', 'other_ids': {}}, 'BIBREF17': {'ref_id': 'b17', 'title': 'Evolutionary analysis of the dynamics of disease', 'authors': [{'first': 'O', 'middle': ['G'], 'last': 'Pybus', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Rambaut', 'suffix': ''}], 'year': 2009, 'venue': 'Nature Rev Genetics', 'volume': '10', 'issn': '', 'pages': '540--549', 'other_ids': {}}, 'BIBREF18': {'ref_id': 'b18', 'title': \"Viral phylodynamics and the search for an 'effective number of infections\", 'authors': [{'first': 'Sdw', 'middle': [], 'last': 'Frost', 'suffix': ''}, {'first': 'E', 'middle': ['M'], 'last': 'Volz', 'suffix': ''}], 'year': None, 'venue': 'Philosophical Transactions of the Royal Society B: Biological Sciences', 'volume': '', 'issn': '', 'pages': '1879--1890', 'other_ids': {}}, 'BIBREF19': {'ref_id': 'b19', 'title': 'Reconstructing disease outbreaks from genetic data: a graph approach', 'authors': [{'first': 'T', 'middle': [], 'last': 'Jombart', 'suffix': ''}, {'first': 'R', 'middle': ['M'], 'last': 'Eggo', 'suffix': ''}, {'first': 'P', 'middle': ['J'], 'last': 'Dodd', 'suffix': ''}, {'first': 'F', 'middle': [], 'last': 'Balloux', 'suffix': ''}], 'year': 2011, 'venue': 'Heredity', 'volume': '', 'issn': '', 'pages': '383--390', 'other_ids': {}}, 'BIBREF20': {'ref_id': 'b20', 'title': 'Integrating genetic and epidemiological data to determine transmission pathways of foot-and-mouth disease virus', 'authors': [{'first': 'E', 'middle': ['M'], 'last': 'Cottam', 'suffix': ''}, {'first': 'G', 'middle': [], 'last': 'Thebaud', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Wadsworth', 'suffix': 
''}, {'first': 'J', 'middle': [], 'last': 'Gloster', 'suffix': ''}, {'first': 'L', 'middle': [], 'last': 'Mansley', 'suffix': ''}, {'first': 'D', 'middle': ['J'], 'last': 'Paton', 'suffix': ''}, {'first': 'D', 'middle': ['P'], 'last': 'King', 'suffix': ''}, {'first': 'D', 'middle': ['T'], 'last': 'Haydon', 'suffix': ''}], 'year': 2008, 'venue': 'Proc. R. Soc. B', 'volume': '275', 'issn': '', 'pages': '887--895', 'other_ids': {}}, 'BIBREF21': {'ref_id': 'b21', 'title': 'Dynamiucal behavior of epidemiological models with nonlinear incidence rates', 'authors': [{'first': 'W', 'middle': [], 'last': 'Liu', 'suffix': ''}, {'first': 'H', 'middle': ['W'], 'last': 'Hethcote', 'suffix': ''}, {'first': 'S', 'middle': ['A'], 'last': 'Levin', 'suffix': ''}], 'year': 1987, 'venue': 'J of Mathematical Biology', 'volume': '25', 'issn': '', 'pages': '359--380', 'other_ids': {}}, 'BIBREF22': {'ref_id': 'b22', 'title': 'BEAST: Bayesian evolutionary analysis by sampling trees', 'authors': [{'first': 'A', 'middle': ['J'], 'last': 'Drummond', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Rambaut', 'suffix': ''}], 'year': 2007, 'venue': 'BMC Evol Biol', 'volume': '7', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF23': {'ref_id': 'b23', 'title': 'Protocols for sampling viral sequences to study epidemic dynamics', 'authors': [{'first': 'J', 'middle': ['C'], 'last': 'Stack', 'suffix': ''}, {'first': 'J', 'middle': ['D'], 'last': 'Welch', 'suffix': ''}, {'first': 'M', 'middle': ['J'], 'last': 'Ferrari', 'suffix': ''}, {'first': 'B', 'middle': ['U'], 'last': 'Shapiro', 'suffix': ''}, {'first': 'B', 'middle': ['T'], 'last': 'Grenfell', 'suffix': ''}], 'year': 2010, 'venue': 'J R Soc Interface', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF24': {'ref_id': 'b24', 'title': 'Dynamics of measles epidemics: estimating scaling of transmission rates using a time series SIR model', 'authors': [{'first': 'O', 'middle': ['N'], 'last': 'Bjørnstad', 'suffix': ''}, {'first': 
'B', 'middle': ['F'], 'last': 'Finglenstädt', 'suffix': ''}, {'first': 'B', 'middle': ['T'], 'last': 'Grenfell', 'suffix': ''}], 'year': 2002, 'venue': 'Ecological monographs', 'volume': '72', 'issn': '', 'pages': '169--184', 'other_ids': {}}, 'BIBREF25': {'ref_id': 'b25', 'title': 'Relaxed Phylogenetics and Dating with Confidence', 'authors': [{'first': 'A', 'middle': ['J'], 'last': 'Drummond', 'suffix': ''}, {'first': 'Syw', 'middle': [], 'last': 'Ho', 'suffix': ''}, {'first': 'M', 'middle': ['J'], 'last': 'Phillips', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Rambaut', 'suffix': ''}], 'year': 2006, 'venue': 'PLoS Biol', 'volume': '4', 'issn': '5', 'pages': '', 'other_ids': {}}, 'BIBREF26': {'ref_id': 'b26', 'title': 'Uncovering Genomic Reassortments among Influenza Strains by Enumerating Maximal Bicliques', 'authors': [{'first': 'N', 'middle': [], 'last': 'Nagarajan', 'suffix': ''}, {'first': 'C', 'middle': [], 'last': 'Kingsford', 'suffix': ''}], 'year': 2008, 'venue': 'IEEE International Conference on Bioinformatics and Biomedicine', 'volume': '', 'issn': '', 'pages': '223--230', 'other_ids': {}}, 'BIBREF27': {'ref_id': 'b27', 'title': 'Viral Population Estimation Using Pyrosequencing', 'authors': [{'first': 'N', 'middle': [], 'last': 'Eriksson', 'suffix': ''}, {'first': 'L', 'middle': [], 'last': 'Pachter', 'suffix': ''}, {'first': 'Y', 'middle': [], 'last': 'Mitsuya', 'suffix': ''}, {'first': 'S', 'middle': ['Y'], 'last': 'Rhee', 'suffix': ''}, {'first': 'C', 'middle': [], 'last': 'Wang', 'suffix': ''}, {'first': 'B', 'middle': [], 'last': 'Gharizadeh', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Ronaghi', 'suffix': ''}, {'first': 'R', 'middle': ['W'], 'last': 'Shafer', 'suffix': ''}, {'first': 'N', 'middle': [], 'last': 'Beerenwinkel', 'suffix': ''}], 'year': 2008, 'venue': 'PLoS Comput Biol', 'volume': '4', 'issn': '5', 'pages': '', 'other_ids': {}}, 'BIBREF28': {'ref_id': 'b28', 'title': 'Infectious disease control using contact tracing 
in random and scale-free networks', 'authors': [{'first': 'I', 'middle': ['Z'], 'last': 'Kiss', 'suffix': ''}, {'first': 'D', 'middle': ['M'], 'last': 'Green', 'suffix': ''}, {'first': 'R', 'middle': ['R'], 'last': 'Kao', 'suffix': ''}], 'year': 2005, 'venue': 'J R Soc Interface', 'volume': '3', 'issn': '', 'pages': '55--62', 'other_ids': {}}, 'BIBREF29': {'ref_id': 'b29', 'title': 'Mathematical models of infectious disease transmission', 'authors': [{'first': 'N', 'middle': ['C'], 'last': 'Grassly', 'suffix': ''}, {'first': 'C', 'middle': [], 'last': 'Fraser', 'suffix': ''}], 'year': 2008, 'venue': 'Nature Reviews Microbiolgy', 'volume': '6', 'issn': '', 'pages': '477--487', 'other_ids': {}}}, 'ref_entries': {'FIGREF0': {'text': 'shows a simple schematic to illustrate the variable host infection time transmission inference problem. The figure shows two simple trees, T 0 and T 1 (left and right tree respectively) where the infected hosts are overlaid as circles on the trees to show the transmission event from host X 0 to X 1 in T 0 and Y 0 to Y 1 in T 1 . The larger circle for Y 1 represents a longer infection time and the two trees represent the evolutionary relationship between the sequences. Assuming comparable homogeneous nucleotide substitution rates, the longer branch length in T 1 could be explained by the longer infection time in host Y 1 prior to sampling. Including the direct transmission branch lengths from T 0 and T 1 in a single genetic based transmission inference model leads to a distribution of genetic distances associated with direct transmission for a wider range of values. Thus, estimating accurate confidence values means ensuring that a representative sampling from the true distribution of infection times can be obtained. To measure the impact of this problem, a new method is introduced to incorporate variation in infection times when assigning a final posterior probability confidence value. 
The proposed method forgoes the use of training data by relying on prior knowledge of viral infection times to act as a constraint in conjunction with phylogenetic data. The results show that with knowledge of infection times confidence in transmission inference can be high and even with limited knowledge of timing patterns, high confidence values can be obtained that exceed the training based methods. Thus, this new transmission inference approach allows for the use of clinical observations of infection to be combined with inferred viral evolution when training data is not available to measure uncertainty associated with specific transmission hypotheses.', 'latex': None, 'type': 'figure'}, 'FIGREF1': {'text': 'Host infection time model. The small circle nodes denote discrete time events and edges denote time intervals. The large blue circles denote a single infected host.', 'latex': None, 'type': 'figure'}, 'FIGREF2': {'text': '(exp X ), the direct transmission hypothesis that host A transmitted the virus to host B is calculated by evaluating all pairwise combinations of exp x and inf x for each host. Candidate transmission events are marked by cases where the infectious time interval for host A overlaps with a potential initial infection time for host B and the inferred most recent common ancestor time for the sequence samples occurs after the initial infection time of A and prior to the initial infection time of B.', 'latex': None, 'type': 'figure'}, 'FIGREF3': {'text': 'shows the notation and the constraints for the most recent common ancestor time (MRCAT). Theinitial infection time E B of host B is E B = R B − inf B − exp B and similarly the initial infection time E A for host A is E A = R A − inf A − exp A ,which leads to the constraints defined by the specific transmission hypothesis. The transmission constraint C 1 (E B , R A , I A ) returns 1 if I A ≤ E B < R A and 0 otherwise. 
The most recent common ancestor constraint C 2 (M RCAT, E B , E A ) returns 1 if E A ≤ M RCAT < E B and 0 otherwise. The probability of', 'latex': None, 'type': 'figure'}, 'FIGREF4': {'text': 'shows how the time series process of outbreak events distinguishes transmission direction. For a sub tree connecting two nodes with k edges, there are k distinct sub trees with each sub tree representing a distinct potential source node for the infection. For example, the transmission sub tree with two edges separating host A and B has two possible sub trees to describe the transmission relationship between the two hosts, one where the source node is A and one sub tree where A and B receive the infection through direct transmission from a common source. In general for a transmission hypothesis linking two nodes by', 'latex': None, 'type': 'figure'}, 'FIGREF5': {'text': 'Transmission chains. The nodes labeled \"s\" represent the putative infection source. Blue nodes represent the two hosts (A and B) with sequenced samples. Green nodes reflect putative infected (hidden) intermediate hosts. Examples of transmission chains from length 1 (the direct transmission case) to 4 are shown.', 'latex': None, 'type': 'figure'}, 'FIGREF6': {'text': 'The hidden nodes are marked green in the figure and represent infected individuals where no direct observable data is available. The hidden nodes explicitly model the probability of the alternative transmission hypothesis linking two observed nodes A and B. To evaluate the M > 1 case an evaluation procedure for each sub tree t M ∈ T M (A, B) is used. Thus, P (σ, δ|M > 1) becomes the value of the most probable alternative hypothesis:', 'latex': None, 'type': 'figure'}, 'FIGREF7': {'text': 'Tree traversal procedure (right panel) for the transmission hypothesis shown in the left panel. 
Left panel shows a transmission hypothesis linking two observed infected nodes A and B (marked in blue), connected by a common unobserved labeled source node marked 0 and two intermediate source nodes marked 1 and 2. The right panel shows the post order tree traversal, where the observed time stamp for each observed node serves as the starting point and the possible infection times for each source node are computed by moving up the tree.', 'latex': None, 'type': 'figure'}, 'FIGREF8': {'text': 'The consistency function g(t M , IN C, IN F, M RCAT ) checks the sub tree t M assuming each node n is assigned a fixed incubation time interval inc n ∈ IN C and a fixed infectious time interval inf n ∈ IN F and returns 1 if the constraints are met and 0 otherwise. The final probability calculation is the set of timing intervals that maximize the product of the individual timing probabilities over all nodes:', 'latex': None, 'type': 'figure'}, 'FIGREF9': {'text': 'considered for each combination of timing intervals. If the number of possible distinct values for inc n and inf n is x the number of timing interval combinations becomes x M +1 . Thus, there is a practical limit to the maximal value for M and x. Measuring the potential linkages for distantly related host pairs can be determined by viewing the estimated probability values of linkage up to the maximum M and when all probability values are low, M can be inferred to be too small to allow transmission inference.', 'latex': None, 'type': 'figure'}, 'FIGREF10': {'text': 'simulations covered the same time frame of 110 days reported for the outbreak. Results are based on 50 independent outbreak simulations. In the genetic distance only based framework labeled training data is used to estimate the likelihood of observing a genetic distance under the different possible transmission hypotheses. 
In the modified framework, a prior belief of how the virus spreads replaces the evidence', 'latex': None, 'type': 'figure'}, 'FIGREF11': {'text': 'Tree used to generate simulated sequences and reflect known transmission links. Two character codes reflect hypothetical farms included in the tree and edges represent transmission relationships between infected farms.', 'latex': None, 'type': 'figure'}, 'FIGREF12': {'text': 'Performance with knowledge of timing. ROC curves showing the ability to correctly infer direct transmission cases (Sensitivity) versus falsely labeling indirect transmission pairs as direct transmission cases (False Positive Rate) using different confidence cut off values. Three methods are shown, using exact knowledge of timing data alone (Timing-Only), using only genetic data (MRCAT-Only) or combining the two and assuming exact knowledge of infection times.from a training set. To evaluate the two approaches, 5 fold cross-validation takes the 50 randomly generated outbreaks and partitions the outbreaks into 5 non overlapping sets of 10 distinct outbreaks to evaluate predicted transmission link accuracy using the remaining 40 outbreaks for training. Evaluations are repeated 5 times to report performance for all 50 outbreaks. When training data is not needed, statistics are simply collected on all 50 outbreaks in one step. Three basic prediction methods are compared -using genetic data only, using the epidemiology timing data only and using both lines of evidence together to inform the final prediction.', 'latex': None, 'type': 'figure'}, 'FIGREF13': {'text': 'shows a ROC curve for the rate of true direct transmission links identified (y-axis) versus the rate of host nodes falsely predicted to be linked (x-axis) using different prediction cutoff values. First consider prediction accuracy with near perfect knowledge of the timing events for the observed host nodes shown by the Timing-Only plot inFigure 7. 
The linear plot reflects the fact that the method generates a single prediction value so that above a certain cutoff either all of the true direct transmission links are predicted at the expense of falsely predicting transmission links for 3% of the other non-linked host node pairs or using a lower cutoff yields no predicted links. Recall that the calculation is meant to represent the most probable collection of incubation and infectious times consistent with the transmission hypothesis', 'latex': None, 'type': 'figure'}, 'FIGREF14': {'text': 'Limited timing knowledge. ROC curves showing the ability to correctly infer direct transmission cases (Sensitivity) versus falsely labeling indirect transmission pairs as direct transmission cases (False Positive Rate) using different confidence cut off values. Four methods are shown, using only timing data (Timing-Only), using only genetic data (MRCAT-Only) or combining the two assuming either a 2-14 day incubation period (Combined:2-14) or a 2-9 day incubation period (Combined:2-9).', 'latex': None, 'type': 'figure'}, 'FIGREF15': {'text': 'consistently higher sensitivity levels and less false positives. The results also show there are clear performance trade offs illustrating the underlying difficulty of the problem. On the one hand capping the maximal incubation time at 9 days greatly constrains the number of plausible transmission hypotheses, thus limiting the number of false positives. The trade off is that for the transmission cases where the incubation time actually exceeds 9 days the true positives can be missed, thus there is a hard upper bound on the sensitivity rate that is less than 1. 
Further relaxing restrictions to allow for the longer incubation times of up to 14 days allows detection of all true positives, but the number of false positives increases too.', 'latex': None, 'type': 'figure'}, 'FIGREF16': {'text': 'A second experiment was conducted to re-examine the transmission relationships reported by Cottom et al.on samples taken from a subset of farms infected during the 2001 United Kingdom outbreak. Although the true outbreak tree cannot be empirically determined, re-examination of this dataset proved useful in determining whether our new transmission inference method using similar information as input would lead to similar conclusions. Moreover, differences in output should highlight important differences in the two approaches and help assess the strengths and weaknesses of the different approaches.', 'latex': None, 'type': 'figure'}, 'FIGREF17': {'text': 'shows the higher confidence inferred transmission links for the published 2001 outbreak data with whole viral genome sequence data available for each farm. An incubation window of 2 to 14 days and an estimated infectious time interval with a 1 day uncertainty was used to find the most likely', 'latex': None, 'type': 'figure'}, 'FIGREF18': {'text': 'A most likely inferred transmission tree with links comparing the tree to previous reports. Red lines show novel links and dashed lines show previously reported alternative links. Each node represents an infected farm. chain. Cottom et al. similarly apply a transmission inference algorithm to the data using a closely related approach that takes both estimated infection times and genetic sequence data as input. Cottom et al. use a statistical parsimony tree relating the collection of sequences as the basis for building putative transmission', 'latex': None, 'type': 'figure'}, 'FIGREF19': {'text': 'common ancestor time estimates, which better reflect the uncertainty of phylogenetic inference. 
Finally, a key difference is in the information representation format. The Cottom et al. approach assigns', 'latex': None, 'type': 'figure'}, 'FIGREF20': {'text': 'Transmission hypotheses include the potential for intermediate unobserved farm involvement to estimate confidence values.', 'latex': None, 'type': 'figure'}, 'FIGREF21': {'text': 'Inferred most recent common ancestor times. Range of inferred most recent common ancestor times (MRCAT) between the B,O sequence pairs (red) and the K,O sequence pairs. Shaded area under the two curves show the time intervals consistent with the respective transmission hypotheses. For farm P, the new method only slightly favors an alternative source -C over O (0.57 versus 0.59) making it impossible to preclude either farm as the likely source. The favoring of C over O can be explained in part by differences in the alternative hypothesis likelihood scores.', 'latex': None, 'type': 'figure'}, 'FIGREF22': {'text': 'shows weighted probability density estimates for the most recent common ancestor times for B and O (red) and K and O (blue). The red and blue shaded area under the curves show the range of predicted initial infection times consistent with the MRCAT and the direct transmission hypothesis for B to O and K to O respectively.', 'latex': None, 'type': 'figure'}, 'FIGREF23': {'text': 'shows an example consensus tree for the selected pairs taken from an outbreak chosen at random. The directed edges labeled in blue show the path lengths connecting the selected nodes with the path length labels from 1 to the maximum observed (23) for this particular outbreak tree. The tree infers evolutionary relationships that are generally consistent with what would be expected given the different transmission chain lengths observed. For example, a path length of 23 separates node 628 from 1740 and indeed the two nodes end up in two different subclades of the tree. 
The key question becomes can additional information be obtained by incorporating knowledge of infection times and alternative transmission hypothesis testing with the phylogenetic data to better capture the range of possible relationships among the infected farms. To answer this, we first show the likelihood evaluation for the node pairs taken from the tree in Figure 13 and then show the same likelihood evaluations averaged across all Likelihood values are given for the 12 selected transmission pairs shown in Figure 13. Each node pair is evaluated for different values of M from 1 to 35 and the true path length for each node pair is shown in the legend. randomized outbreak trials. To limit computational costs, evaluated path lengths start with 1 and 5 and continue in intervals of 5 up to 35. The likelihood value P (δ|M ) is reported as the maximal likelihood value for all alternative sub trees of path length M. Figure 14 shows the value of P (δ|M ) for each pair of nodes with node pairs labeled by their true transmission path length. For the sampled outbreak shown in Figure 13, with the exception of M=11, node pairs for path lengths up to 15 show P (δ|M ) values that favor a specific hypothesis or a small subset of hypotheses. As the true path length increases, the average likelihood value drops and the distinction between different path lengths becomes less clear. For the node pair separated by path length of 15 for example, there are clearly higher values for inferring a path length from between 5 and 20, but there is limited distinction within this range. For the most part the framework largely reinforces what is visualized using the consensus phylogenetic tree. An interesting case, however, is the node pair (1255, 1888) linked by a transmission chain of 9, compared with node pair (713, 1119)separated by chain length 13. 
In the framework, both pairs of nodes have likelihood values that peak at M=5, however the alternative transmission hypotheses for linking (713, 1119) with M=10 or M=15 are much higher than they are for the (1255, 1881) case. This is an example where the relationships inferred by the tree may be misconstrued without the explicit use of a transmission inference framework. It is important to note that the transmission framework does not resolve all conflicts. For the M=11 case', 'latex': None, 'type': 'figure'}, 'FIGREF24': {'text': 'Likelihood values for all node pairs for a select path length averaged across all 40 trials. Plot shows path lengths where the average likelihood value peaks at M=5 or less. node pair (138, 184), the nodes show up in two different subclades suggesting a more distant transmission relationship, than might be expected by the arrangement of other pairs in the same subclade and separated by longer transmission chains. In this case, the transmission inference framework also suggests a more distant transmission link, given the relatively uniform distribution of low P (δ|M ) values.', 'latex': None, 'type': 'figure'}, 'FIGREF25': {'text': 'Figure 15 and Figure 16 show P (δ|M ) values averaged over the 40 random trials. For clarity the two', 'latex': None, 'type': 'figure'}, 'FIGREF26': {'text': 'Likelihood values for node pairs connected by longer path lengths. Plot is similar toFigure 15but shows path lengths where the average likelihood value peaks at M > 5.', 'latex': None, 'type': 'figure'}, 'FIGREF27': {'text': \"ConclusionA new method is presented that takes as input a collection of time-stamped viral genomes and basic knowledge about the timing features of the infection, the underlying transmission network and returns confidence values that assess the probability of correctly inferring host transmission links. 
Simulations of an FMD outbreak show how the combined use of the epidemiology based timing constraints with the phylogenetic inferred most recent common ancestor times improve confidence in transmission inference over methods that rely on just one individual feature. The simulations show that labeled training data measuring genetic distance between direct and indirect transmission cases can be used without the additional knowledge of the features of the viral transmission, however, appropriate training data in some cases may not be available. The approach presented is expected to provide a basis for building flexible tools to more readily map inferred viral evolutionary relationships onto epidemiological models to more accurately quantify host transmission relationships. Authors' contributions JEA and SPV conceived and designed methods. JEA implemented experiments and drafted the manuscript. JEA and SPV analyzed results.\", 'latex': None, 'type': 'figure'}}, 'back_matter': [{'text': 'The authors acknowledge funding for this work from the U.S. Department of Homeland Security. This work was performed under the auspices of the U.S. 
Department of Energy by Lawrence Livermore National Laboratory under Contract DE-AC52-07NA27344.', 'cite_spans': [], 'ref_spans': [], 'section': 'Acknowledgments'}]}\n" - ] + "data": { + "text/plain": [ + "1934" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "for i in range(5):\n", - " print(article_array[i])" + "all_json = glob.glob(f'{biorxiv_medrxiv_path}/**/*.json', recursive=True)\n", + "len(all_json)" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 105, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "dict_keys(['paper_id', 'metadata', 'abstract', 'body_text', 'bib_entries', 'ref_entries', 'back_matter'])" + "'D:/CORD-19-research-challenge/biorxiv_medrxiv/biorxiv_medrxiv\\\\pdf_json\\\\0015023cc06b5362d332b3baf348d11567ca2fbb.json'" ] }, - "execution_count": 21, + "execution_count": 105, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "article_array[0].keys()" + "all_json[0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "From above, we can see that there are keys that describe each paper of the biorxiv journal. Of these, Our paritcular interest lies in the abstract and the body_text. The title is useful too, and to get it, we have to look inside the metadata part of the file. 
class Article:
    """In-memory view of one parsed article JSON file.

    Attributes
    ----------
    paper_id : value stored under the top-level ``'paper_id'`` key.
    title : value stored under ``'metadata' -> 'title'``.
    abstract : the ``'text'`` of every entry under ``'abstract'``, joined
        with a single space; ``""`` when the record has no abstract.
    body_text : the ``'text'`` of every entry under ``'body_text'``, joined
        with a single space.
    """

    def __init__(self, filepath):
        """Load ``filepath`` (a JSON file) and extract id, title and text.

        Raises ``KeyError`` if ``'paper_id'``, ``'metadata'``/``'title'`` or
        ``'body_text'`` is absent, and whatever ``open``/``json.load`` raise
        for an unreadable or malformed file.
        """
        # Decode explicitly as UTF-8: the platform default encoding (e.g.
        # cp1252 on Windows) can mis-decode or reject non-ASCII characters.
        with open(filepath, encoding='utf-8') as f:
            record = json.load(f)
        # The file is closed at this point; everything below works on the
        # parsed dict only.

        self.paper_id = record['paper_id']
        self.title = record['metadata']['title']
        # A record may carry no 'abstract' section; treat that as an empty
        # abstract instead of failing the whole load with KeyError.
        self.abstract = self._join_paragraphs(record.get('abstract', []))
        self.body_text = self._join_paragraphs(record['body_text'])

    @staticmethod
    def _join_paragraphs(entries):
        """Concatenate the ``'text'`` of each entry, separated by one space.

        The separator keeps the last word of one paragraph from fusing with
        the first word of the next.
        """
        return " ".join(str(entry['text']) for entry in entries)

    def __repr__(self):
        return f'Article Object with id: {self.paper_id} \n title:{self.title} \n abstract:{self.abstract}'
Less well characterised structures are a 5′ terminal 360 nucleotide 32 stem-loop, a variable length poly-C-tract of approximately 100-200 nucleotides and a series of 33 two to four tandemly repeated pseudoknots (PKs). We investigated the structures of the PKs 34 by selective 2′ hydroxyl acetylation analysed by primer extension (SHAPE) analysis and 35 determined their contribution to genome replication by mutation and deletion experiments. 36 SHAPE and mutation experiments confirmed the importance of the previously predicted PK 37 structures for their function. Deletion experiments showed that although PKs are not essential 38for replication, they provide genomes with a competitive advantage. However, although 39 replicons and full-length genomes lacking all PKs were replication competent, no infectious 40 virus was rescued from genomes containing less than one PK copy. This is consistent with our 41 earlier report describing the presence of putative packaging signals in the PK region. 42 43 author/funder. All rights reserved. No reuse allowed without permission.'" ] }, - "execution_count": 22, + "execution_count": 108, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "article_array[0]['metadata'].keys()" + "articles[0].abstract" ] }, { - "cell_type": "markdown", - "metadata": {}, + "cell_type": "code", + "execution_count": 109, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Article Object with id: 00340eea543336d54adda18236424de6a5e91c9d \n", + " title:Analysis Title: Regaining perspective on SARS-CoV-2 molecular tracing and its implications \n", + " abstract:During the past three months, a new coronavirus (SARS-CoV-2) epidemic has been growing exponentially, affecting over 100 thousand people worldwide, and causing enormous distress to economies and societies of affected countries. 
A plethora of analyses based on viral sequences has already been published, in scientific journals as well as through non-peer reviewed channels, to investigate SARS-CoV-2 genetic heterogeneity and spatiotemporal dissemination. We examined all full genome sequences currently available to assess the presence of sufficient information for reliable phylogenetic and phylogeographic studies. Our analysis clearly shows severe limitations in the present data, in light of which any finding should be considered, at the very best, preliminary and hypothesis-generating. Hence the need for avoiding stigmatization based on partial information, and for continuing concerted efforts to increase number and quality of the sequences required for robust tracing of the epidemic.. CC-BY-NC-ND 4.0 International license It is made available under a author/funder, who has granted medRxiv a license to display the preprint in perpetuity.is the (which was not peer-reviewed) The copyright holder for this preprint ." 
+ ] + }, + "execution_count": 109, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "If printed as a whole, each file would look like the below output:" + "articles[1]" ] }, { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - "\n", - " \"paper_id\": \"f056da9c64fbf00a4645ae326e8a4339d015d155\",\n", - "\n", - " \"metadata\": {\n", - "\n", - " \"title\": \"SIANN: Strain Identification by Alignment to Near Neighbors\",\n", - "\n", - " \"authors\": [\n", - "\n", - " {\n", - "\n", - " \"first\": \"Samuel\",\n", - "\n", - " \"middle\": [\n", - "\n", - " \"S\"\n", - "\n", - " ],\n", - "\n", - " \"last\": \"Minot\",\n", - "\n", - " \"suffix\": \"\",\n", - "\n", - " \"affiliation\": {},\n", - "\n", - " \"email\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"Stephen\",\n", - "\n", - " \"middle\": [\n", - "\n", - " \"D\"\n", - "\n", - " ],\n", - "\n", - " \"last\": \"Turner\",\n", - "\n", - " \"suffix\": \"\",\n", - "\n", - " \"affiliation\": {},\n", - "\n", - " \"email\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"Krista\",\n", - "\n", - " \"middle\": [\n", - "\n", - " \"L\"\n", - "\n", - " ],\n", - "\n", - " \"last\": \"Ternus\",\n", - "\n", - " \"suffix\": \"\",\n", - "\n", - " \"affiliation\": {},\n", - "\n", - " \"email\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"Dana\",\n", - "\n", - " \"middle\": [\n", - "\n", - " \"R\"\n", - "\n", - " ],\n", - "\n", - " \"last\": \"Kadavy\",\n", - "\n", - " \"suffix\": \"\",\n", - "\n", - " \"affiliation\": {},\n", - "\n", - " \"email\": \"\"\n", - "\n", - " }\n", - "\n", - " ]\n", - "\n", - " },\n", - "\n", - " \"abstract\": [\n", - "\n", - " {\n", - "\n", - " \"text\": \"Next-generation sequencing is increasingly being used to study samples composed of mixtures of organisms, such as in clinical applications where the 
presence of a pathogen at very low abundance may be highly important. We present an analytical method (SIANN: Strain Identification by Alignment to Near Neighbors) specifically designed to rapidly detect a set of target organisms in mixed samples that achieves a high degree of species-and strain-specificity by aligning short sequence reads to the genomes of near neighbor organisms, as well as that of the target. Empirical benchmarking alongside the current state-of-the-art methods shows an extremely high Positive Predictive Value, even at very low abundances of the target organism in a mixed sample. SIANN is available as an Illumina BaseSpace app, as well as through Signature Science, LLC. SIANN results are presented in a streamlined report designed to be comprehensible to the non-specialist user, providing a powerful tool for rapid species detection in a mixed sample. By focusing on a set of (customizable) target organisms and their near neighbors, SIANN can operate quickly and with low computational requirements while delivering highly accurate results.\",\n", - "\n", - " \"cite_spans\": [],\n", - "\n", - " \"ref_spans\": [],\n", - "\n", - " \"section\": \"Abstract\"\n", - "\n", - " }\n", - "\n", - " ],\n", - "\n", - " \"body_text\": [\n", - "\n", - " {\n", - "\n", - " \"text\": \"There are many different methods that characterize the mixture of organisms present within a metagenomic dataset. Such datasets are generated when a complex environmental sample is processed by a \\\"next-generation\\\" high-throughput genome sequencing protocol, and they consist of large numbers of short nucleotide sequences. Each sequence represents a small fragment of a randomly selected genome from the very large collection of genomes present in the source sample. Those sequences indicate the presence of one organism or another according to their similarity to a set of known reference genomes. 
While a given sequence may be unique to one species, it also may be found in diverse organisms across the tree of life. Therefore, one analytical challenge (among many) is to take that collection of sequences (likely numbering in the millions) and accurately determine what species are present in the sample. Here we describe a novel method (SIANN: Strain Identification by Alignment to Near Neighbors) that is specifically designed to rapidly detect a set of targeted organisms from a metagenomic dataset by aligning reads to genomic regions that are unique at the strain or species level.\",\n", - "\n", - " \"cite_spans\": [],\n", - "\n", - " \"ref_spans\": [],\n", - "\n", - " \"section\": \"Introduction\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"text\": \"The analytical question motivating a particular piece of metagenomic bioinformatic analysis may vary widely by user and sample type (Segata, et al., 2013) . For example, the function of the human gut microbiome may depend on the relative abundance of hundreds of species of bacteria and the types of metabolic genes they contain (Wu, et al., 2011; Schloissnig, et al., 2013) . In contrast, the clinical treatment of a patient may depend on whether or not a particular virus, or a consortium of co-infecting pathogens, is/are detected in their blood. It is this second class of presence/absence questions that SIANN is designed to address. 
SIANN is appropriate for situations in which a user wants to know whether a particular organism or set of organisms is present in a sample, but isn't interested in the functions encoded in their genomes, the relative abundance of each organism, or any other more in-depth analysis.\",\n", - "\n", - " \"cite_spans\": [\n", - "\n", - " {\n", - "\n", - " \"start\": 132,\n", - "\n", - " \"end\": 154,\n", - "\n", - " \"text\": \"(Segata, et al., 2013)\",\n", - "\n", - " \"ref_id\": null\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"start\": 329,\n", - "\n", - " \"end\": 347,\n", - "\n", - " \"text\": \"(Wu, et al., 2011;\",\n", - "\n", - " \"ref_id\": null\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"start\": 348,\n", - "\n", - " \"end\": 374,\n", - "\n", - " \"text\": \"Schloissnig, et al., 2013)\",\n", - "\n", - " \"ref_id\": \"BIBREF10\"\n", - "\n", - " }\n", - "\n", - " ],\n", - "\n", - " \"ref_spans\": [],\n", - "\n", - " \"section\": \"Introduction\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"text\": \"Metagenomic classification methods are based on a wide variety of theoretical underpinnings. The basic varieties include alignment of reads to various nucleotide databases or exact matching to nucleotide or protein signature sequences (or kmers). 
A representative set of recent methods are described in Table 1 (also see Bazinet & Cummings 2012) .\",\n", - "\n", - " \"cite_spans\": [\n", - "\n", - " {\n", - "\n", - " \"start\": 321,\n", - "\n", - " \"end\": 345,\n", - "\n", - " \"text\": \"Bazinet & Cummings 2012)\",\n", - "\n", - " \"ref_id\": \"BIBREF1\"\n", - "\n", - " }\n", - "\n", - " ],\n", - "\n", - " \"ref_spans\": [\n", - "\n", - " {\n", - "\n", - " \"start\": 303,\n", - "\n", - " \"end\": 310,\n", - "\n", - " \"text\": \"Table 1\",\n", - "\n", - " \"ref_id\": null\n", - "\n", - " }\n", - "\n", - " ],\n", - "\n", - " \"section\": \"Approach\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"text\": \"Alignment to large nucleotide database Huson, et al., 2011 PhymmBL Alignment to large nucleotide database with interpolated Markov models Brady & Salzberg, 2011 Metaphyler Alignment to clade-specific marker genes Liu, et al., Overall, these methods are designed to either classify individual reads to, and/or predict the total abundance of, clades (e.g. genus or species) across the entire tree of life. They generally require reference databases that are very large and/or require a large amount of processing to generate. The gap SIANN is designed to fill is when the entire tree of life is irrelevant, and only predefined subsets of organisms need to be detected. For an underlying method we chose read alignment to diagnostic genomic regions because the algorithms for read alignment are highly parallelizable and have been optimized heavily by the community at large (the current implementation of SIANN uses bowtie2 [Langmead & Salzberg, 2012] for the alignment function, but can be adapted to any alignment algorithm). This approach is distinct from using cladespecific marker genes (Segata, et al., 2012) because unique regions that are larger, smaller, or outside of genes can also be used. 
Furthermore, this approach supports the rapid construction of custom databases using reference genome sets that require only minimal user-supplied structure.\",\n", - "\n", - " \"cite_spans\": [\n", - "\n", - " {\n", - "\n", - " \"start\": 39,\n", - "\n", - " \"end\": 66,\n", - "\n", - " \"text\": \"Huson, et al., 2011 PhymmBL\",\n", - "\n", - " \"ref_id\": null\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"start\": 138,\n", - "\n", - " \"end\": 160,\n", - "\n", - " \"text\": \"Brady & Salzberg, 2011\",\n", - "\n", - " \"ref_id\": null\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"start\": 213,\n", - "\n", - " \"end\": 225,\n", - "\n", - " \"text\": \"Liu, et al.,\",\n", - "\n", - " \"ref_id\": null\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"start\": 922,\n", - "\n", - " \"end\": 949,\n", - "\n", - " \"text\": \"[Langmead & Salzberg, 2012]\",\n", - "\n", - " \"ref_id\": \"BIBREF7\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"start\": 1090,\n", - "\n", - " \"end\": 1112,\n", - "\n", - " \"text\": \"(Segata, et al., 2012)\",\n", - "\n", - " \"ref_id\": null\n", - "\n", - " }\n", - "\n", - " ],\n", - "\n", - " \"ref_spans\": [],\n", - "\n", - " \"section\": \"Name Method Reference MEGAN\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"text\": \"To understand the principle at work, consider a set of reads that have been aligned to the genomes of several strains belonging to two species. Some regions of those genomes are species-specific, some are strain-specific, and some are shared ( Figure 1a ). When a set of reads is aligned to those genomes such that each read is placed in as many locations as it has a match (at a reasonably stringent threshold), visual inspection of the distribution of reads yields an intuitive understanding of the true source organism as Species I/Strain B (Figure 1b ). 
If Strain B were not present in the reference database, it would still be clear that the organism was an unknown strain of Species I.\",\n", - "\n", - " \"cite_spans\": [],\n", - "\n", - " \"ref_spans\": [\n", - "\n", - " {\n", - "\n", - " \"start\": 244,\n", - "\n", - " \"end\": 254,\n", - "\n", - " \"text\": \"Figure 1a\",\n", - "\n", - " \"ref_id\": null\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"start\": 545,\n", - "\n", - " \"end\": 555,\n", - "\n", - " \"text\": \"(Figure 1b\",\n", - "\n", - " \"ref_id\": null\n", - "\n", - " }\n", - "\n", - " ],\n", - "\n", - " \"section\": \"Name Method Reference MEGAN\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"text\": \"The unique identification of a species or strain is quantified by the proportion of the genome that is determined to be species-or strain-specific (defined as reads that are aligned to regions that are species-or strain-specific). Each species and strain is then assigned a numerical measure of the proportion that is covered by these diagnostic reads, and that proportional measure is compared to the ideal case, where sequences from a single organism (generated in silico) are aligned against the database in an identical manner. After that normalization factor is applied, the resulting score indicates whether the source sample contained any of the organisms in the reference database. Figure 1 . A) For a group of strains belonging to two different species, some regions may be unique to each species (region 1), while other regions may be unique to strains within each species (regions 2 and 3). B) A set of reads are aligned to these genomes, and the ones that align in a species-or strain-specific manner are identified by the combination of genomes to which they align. 
In this example, Strain B of Species I is the organism identified.\",\n", - "\n", - " \"cite_spans\": [],\n", - "\n", - " \"ref_spans\": [\n", - "\n", - " {\n", - "\n", - " \"start\": 690,\n", - "\n", - " \"end\": 698,\n", - "\n", - " \"text\": \"Figure 1\",\n", - "\n", - " \"ref_id\": null\n", - "\n", - " }\n", - "\n", - " ],\n", - "\n", - " \"section\": \"Name Method Reference MEGAN\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"text\": \"The analysis is conducted independently on both the species and the strain level, so that if the true strain is not present in the database, the species of origin will still be identified. While many methods consider the complete taxonomic tree and assign reads to the least common ancestor, SIANN considers only two taxonomic levels: species and strain, throwing out anything that is not unique at one of those levels and thus obviating many of the confounding factors introduced by manually curated taxonomies.\",\n", - "\n", - " \"cite_spans\": [],\n", - "\n", - " \"ref_spans\": [],\n", - "\n", - " \"section\": \"Name Method Reference MEGAN\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"text\": \"The example shown in Figure 1b indicates that species-specific reads are identified as reads that align to one species (Species I, in that case) but not the other. If Species II were not present in the example shown in Figure 1b , a much larger number of reads would be assigned as \\\"species-specific,\\\" when in fact those regions are shared with other species. Therefore, the ability of this method to identify strain-and species-specific sequences is a direct function of the inclusion of near neighbors in the reference database. This characteristic is shared among many classification algorithms, but it is of particular note for this method when users have an opportunity to construct their own database.. 
In order to detect a target species with a high degree of specificity (reducing false positives), it is necessary to include other related species in the reference database. Only by parallel alignment to those near neighbors can the redundant sequences be separated from the species-specific ones. For example, in order to detect Bacillus anthracis in a sample, it would be necessary to include other species of Bacilli in the reference database so that the presence of B. cereus or B. thuringiensis in a sample does not lead to a false call for B. anthracis.\",\n", - "\n", - " \"cite_spans\": [],\n", - "\n", - " \"ref_spans\": [\n", - "\n", - " {\n", - "\n", - " \"start\": 21,\n", - "\n", - " \"end\": 30,\n", - "\n", - " \"text\": \"Figure 1b\",\n", - "\n", - " \"ref_id\": null\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"start\": 219,\n", - "\n", - " \"end\": 228,\n", - "\n", - " \"text\": \"Figure 1b\",\n", - "\n", - " \"ref_id\": null\n", - "\n", - " }\n", - "\n", - " ],\n", - "\n", - " \"section\": \"Name Method Reference MEGAN\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"text\": \"The nomenclature of genus, species, and strain is potentially problematic because it does not correspond to a consistent degree of evolutionary distance or genomic distinctiveness. The ability to distinguish two organisms by any method using genomic sequence data is proportional to the amount of each genome that is shared or unique. One might assume that any two organisms of the same species will have a relatively predictable amount of shared genomic identity. However, some pairs of organisms from the same species may have less in common than other pairs of organisms from different species or even genera. This ambiguity impacts SIANN in two ways. 
If two organisms have very little genomic sequence to distinguish them, the sensitivity of SIANN to detect either one will diminish (the rate of false negatives will increase as the likelihood of sequencing unique regions decreases). Conversely, if an organism is extremely dissimilar to the near neighbors selected for the database, the specificity with which SIANN detects that organism will decline (the rate of false positives will increase as the number of related genomes available in the database decreases). For example, if a database contained only E. coli and B.anthracis, a sample containing B. cereus would be misidentified as contraining B. anthracis. In the intended use case, a database targeting B. anthracis would contain B. cereus and a number of other near neighbors to prevent that kind of misidentification. It would be convenient to say that an ideal database can be made by calculating the ideal genetic distance between all references and then finding an ideal set of organisms to make up that database, but the behavior of any database will be governed by the particular genomes of the organisms it encounters in the wild. Because not all organisms evolve in the same manner (differences in mutation rate, horizontal gene transfer, recombination, etc), the suitability of a database and method to detect a given organism can only be determined by thorough validation and benchmarking, as well as updating the reference database as needed. 
Users of SIANN may construct their own custom databases to include newly identified genomes or specific subsets of genomes that best suit their research interests.\",\n", - "\n", - " \"cite_spans\": [],\n", - "\n", - " \"ref_spans\": [],\n", - "\n", - " \"section\": \"Name Method Reference MEGAN\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"text\": \"Steps to construct a custom database:\",\n", - "\n", - " \"cite_spans\": [],\n", - "\n", - " \"ref_spans\": [],\n", - "\n", - " \"section\": \"Name Method Reference MEGAN\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"text\": \"1. Select a set of target organisms 2. Gather a set of genome sequences for those target organisms as well as a matched set of near neighbors 3. Using those reference genome sequences as an input, SIANN will: a. Construct a reference index for alignment b. Simulate a set of reads from each genome c. Align each of those simulated read sets to all of the reference genomes d. Calculate the proportion of each reference genome that is strain-or species-specific e. [If two organisms do not have a minimal amount of unique sequences that exceeds the rate of sequencing error, SIANN asks that all but one of those organisms are removed from the database to eliminate redundancy. 
Note that the user can provide a single representative genome with multiple strain names so that the redundant strain names are not lost.]\",\n", - "\n", - " \"cite_spans\": [],\n", - "\n", - " \"ref_spans\": [],\n", - "\n", - " \"section\": \"Name Method Reference MEGAN\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"text\": \"The files contained within each SIANN database are a compressed genomic index and a list containing the proportion of each reference genome that was found to be strainor species-specific during database construction.\",\n", - "\n", - " \"cite_spans\": [],\n", - "\n", - " \"ref_spans\": [],\n", - "\n", - " \"section\": \"Name Method Reference MEGAN\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"text\": \"To run SIANN:\",\n", - "\n", - " \"cite_spans\": [],\n", - "\n", - " \"ref_spans\": [],\n", - "\n", - " \"section\": \"Name Method Reference MEGAN\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"text\": \"1. Select a pre-made SIANN database and a set of sequences to be analyzed, and 2. SIANN will: a. Align each of the reads against the reference genomes b. Calculate the proportion of each reference genome that is strain-or species-specific within those reads c. Compare that proportion to the simulated ideal case generated during database creation . CC-BY-NC-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprint d. Calculate the probability that the given results could be generated by random chance e. Report the normalized proportion and non-parametric statistic of likelihood for each strain and species in the reference database. 
The normalized proportion of the genome covered by strain-or speciesspecific reads is the primary statistic reported by this tool.\",\n", - "\n", - " \"cite_spans\": [],\n", - "\n", - " \"ref_spans\": [],\n", - "\n", - " \"section\": \"Name Method Reference MEGAN\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"text\": \"The performance of SIANN (version 1.6) was tested in comparison to the following stateof-the-art metagenomic classification programs: LMAT (version 1.2), MetaPhlAn (version 1.7.7), and Kraken (version 0.9.1b). All of the programs in Table 1 were investigated for this effort, and three were chosen based on their ability to run on our high-performance computing cluster with an execution time and memory requirement that would be suitable to a clinical lab. Each program was run on a set of 600 simulated datasets generated by MetaSim (Richter, et al., 2008) . Each dataset consisted of 15,000,000 reads (100bp single-ended) with Illumina-simulated error (fourth-degree polynomial) (Korbel, et al., 2009) . The 600 datasets were broken into 12 sets of 50 replicates. 
Each of the 12 sets contained organisms at different levels of abundance as shown in Table 2 .\",\n", - "\n", - " \"cite_spans\": [\n", - "\n", - " {\n", - "\n", - " \"start\": 535,\n", - "\n", - " \"end\": 558,\n", - "\n", - " \"text\": \"(Richter, et al., 2008)\",\n", - "\n", - " \"ref_id\": \"BIBREF9\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"start\": 682,\n", - "\n", - " \"end\": 704,\n", - "\n", - " \"text\": \"(Korbel, et al., 2009)\",\n", - "\n", - " \"ref_id\": \"BIBREF5\"\n", - "\n", - " }\n", - "\n", - " ],\n", - "\n", - " \"ref_spans\": [\n", - "\n", - " {\n", - "\n", - " \"start\": 233,\n", - "\n", - " \"end\": 240,\n", - "\n", - " \"text\": \"Table 1\",\n", - "\n", - " \"ref_id\": null\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"start\": 852,\n", - "\n", - " \"end\": 859,\n", - "\n", - " \"text\": \"Table 2\",\n", - "\n", - " \"ref_id\": null\n", - "\n", - " }\n", - "\n", - " ],\n", - "\n", - " \"section\": \"Benchmarking\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"text\": \"Organisms were specifically chosen in pairs so that the ability to distinguish these near neighbors could be determined. The abundances were staggered at 4-fold intervals so that a wide range could be evaluated. All known species of near neighbors for each of the 12 target organisms were included in the reference database used by SIANN for this benchmarking (\\\"Target Pathogen Database\\\") and are shown in Appendix 1.\",\n", - "\n", - " \"cite_spans\": [],\n", - "\n", - " \"ref_spans\": [],\n", - "\n", - " \"section\": \"Benchmarking\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"text\": \"Each program outputs a distinct measure. Kraken and LMAT both count the reads assigned to each taxon, MetaPhlAn calculates the abundance, and SIANN outputs a measure of the proportion of diagnostic genomic regions present. 
To put these measures on an even footing, we empirically calculated the false positive rate for each method over all 600 samples, at each possible measure of output. Because each dataset is made up of known organisms, any result can be classified as true or false. Therefore, for any possible result (say, 513 reads classified by LMAT or 1.6% abundance assigned by MetaPhlAn), one can calculate the proportion of calls with at least the same amount of support that were correct (True Positives/[True Positives+False Positives]), over all of the 600 datasets. That measure is commonly given as Positive Predictive Value (PPV). For each program, the results can be translated from the raw value into a PPV that is based on this empirical measure of error. The key item of interest is the PPV value for the results that we know to be true positives, the defined spike organisms. Another way of describing this approach is to say that the results of each program have been normalized to the false positive error rate that was empirically observed. If another set of samples were generated, the PPV vs. raw value curve ( Figure 2 ) would likely fall differently, but in this case it gives us a means of comparing a diverse set of methods against the same ground truth. If method 1 detects an organism with a higher PPV than method 2 does, it means that method 1 has fewer false positives in the range that it reports true positives, which is the definition of utility in this setting.\",\n", - "\n", - " \"cite_spans\": [],\n", - "\n", - " \"ref_spans\": [\n", - "\n", - " {\n", - "\n", - " \"start\": 1339,\n", - "\n", - " \"end\": 1347,\n", - "\n", - " \"text\": \"Figure 2\",\n", - "\n", - " \"ref_id\": \"FIGREF0\"\n", - "\n", - " }\n", - "\n", - " ],\n", - "\n", - " \"section\": \"Benchmarking\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"text\": \"For each method, PPV was calculated as a function of raw output value. 
Briefly, this was done by compiling the output for all 600 samples, labeling each result as false or true based on the sample set that it came from, and then calculating (at each possible value of output) what the proportion of TP/[TP+FP] was for results with at least that level of raw output. Some simplification steps were taken, such as focusing on the specieslevel assignments (for comparison with methods that do not perform strain assignment), and only taking the top hit for each species from each dataset. Custom R and BASH scripts were used for the data compilation and analysis.\",\n", - "\n", - " \"cite_spans\": [],\n", - "\n", - " \"ref_spans\": [],\n", - "\n", - " \"section\": \"Benchmarking\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"text\": \"The relationship of raw output value to PPV is shown for each of the four methods in Figure 2 . The point at which PPV is very close to 1 (where 95% of results are true positives) is ~41,000 reads for Kraken, ~2,800 reads for LMAT, ~38% abundance for MetaPhlAn, and 0.21 for SIANN. For SIANN this means that having 38% of the species-unique genome covered by reads resulted in the vast majority of calls being accurate. . CC-BY-NC-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . 
https://doi.org/10.1101/001727 doi: bioRxiv preprint For readassignment methods (such as LMAT and Kraken), manual inspection of the results may yield a different understanding of confidence than is presented here, or in any automated analysis.\",\n", - "\n", - " \"cite_spans\": [],\n", - "\n", - " \"ref_spans\": [\n", - "\n", - " {\n", - "\n", - " \"start\": 85,\n", - "\n", - " \"end\": 93,\n", - "\n", - " \"text\": \"Figure 2\",\n", - "\n", - " \"ref_id\": \"FIGREF0\"\n", - "\n", - " }\n", - "\n", - " ],\n", - "\n", - " \"section\": \"Results\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"text\": \"For example, while each read that is assigned by LMAT and Kraken fall above a certain cutoff for speciesspecificity, some individual reads may be much more specific than others. One could identify a read that aligns to a single species of bacteria with 100% accuracy over its 300bp length, with the next closest match being only 90% similar. It is extremely unlikely that a 300bp exact match would arise due to random chance, and so the user could say with confidence that the organism of interest is found within the sequence data (not considering contamination, horizontal gene transfer, etc). However, such an approach is not currently implemented in an automated method, and many of the steps needed to make that assertion are performed manually by a domain expert, including alignment to near neighbors and ensuring that the read does not fall within a transposon, plasmid, etc. Therefore, while one could say that a single read is all that is needed to state with high PPV that an organism is present, the amount of reads assigned in an automated manner needed to achieve that level of PPV will number in the thousands (Fig 2) . Table 2 ), and each program (boxes at top), across a maximum of 50 replicates (indicated by the size of each point). 
Note that the reference database for MetaPhlAn does not include viruses, and the reference database for Kraken does not include RNA viruses (e.g., Hanta virus).\",\n", - "\n", - " \"cite_spans\": [],\n", - "\n", - " \"ref_spans\": [\n", - "\n", - " {\n", - "\n", - " \"start\": 1125,\n", - "\n", - " \"end\": 1132,\n", - "\n", - " \"text\": \"(Fig 2)\",\n", - "\n", - " \"ref_id\": \"FIGREF0\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"start\": 1135,\n", - "\n", - " \"end\": 1142,\n", - "\n", - " \"text\": \"Table 2\",\n", - "\n", - " \"ref_id\": null\n", - "\n", - " }\n", - "\n", - " ],\n", - "\n", - " \"section\": \"Results\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"text\": \". CC-BY-NC-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprint The next phase of benchmarking was to determine how many raw input reads were needed to achieve the threshold for high PPV. To demonstrate this we plotted the known abundance of each spike organism against the PPV value generated by each method (Figure 3 ). Each point (an organism at a known level of abundance) is comprised of a maximum of 50 replicates, where the diameter of each point increases with an increasing number of replicates. For demonstration purposes we are showing two pairs of bacteria and three viruses. Recall that for each of the pairs of bacteria (and the two poxviruses) any sample containing one did not contain the other (as shown in Table 1 ). The empty boxes result from the organisms not being called at any abundance. For MetaPhlAn, that is a result of no viruses being included in the version of the reference database available for this analysis. Kraken assigned no reads to Hanta virus because viral RNA genomes were not included in this version of the reference database (personal communication with D. Wood). 
This emphasizes the point that a) the ability to create custom databases targeting organisms of interest can be valuable, and b) the performance of any method must be benchmarked against each potential target of interest.\",\n", - "\n", - " \"cite_spans\": [],\n", - "\n", - " \"ref_spans\": [\n", - "\n", - " {\n", - "\n", - " \"start\": 460,\n", - "\n", - " \"end\": 469,\n", - "\n", - " \"text\": \"(Figure 3\",\n", - "\n", - " \"ref_id\": \"FIGREF1\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"start\": 875,\n", - "\n", - " \"end\": 882,\n", - "\n", - " \"text\": \"Table 1\",\n", - "\n", - " \"ref_id\": null\n", - "\n", - " }\n", - "\n", - " ],\n", - "\n", - " \"section\": \"Results\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"text\": \"All methods were able identify the bulk of organisms in their databases at high abundances (75% and 18%, Figure 3 ), however performance varied considerably at lower abundances and depended on the particular organism and method used. SIANN detected each organism at high confidence, even at levels as low as 0.3% and 0.07% of the total.\",\n", - "\n", - " \"cite_spans\": [],\n", - "\n", - " \"ref_spans\": [\n", - "\n", - " {\n", - "\n", - " \"start\": 105,\n", - "\n", - " \"end\": 113,\n", - "\n", - " \"text\": \"Figure 3\",\n", - "\n", - " \"ref_id\": \"FIGREF1\"\n", - "\n", - " }\n", - "\n", - " ],\n", - "\n", - " \"section\": \"Results\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"text\": \"The process of detecting trace amounts of a specific organism in a complex mixture of DNA is challenging enough for an expert, but that pales in comparison to the difficulty of accomplishing the same certainty of detection in an automated manner. The results presented here show that SIANN rapidly detects the presence of a given set of organisms with a high degree of specificity and sensitivity. 
For example, at the 95% confidence (PPV) cutoff of 0.2, SIANN reliably detects all of the organisms tested here at as low as 0.3% abundance. This strong performance is likely due to the fact that SIANN is able to use a method (read alignment to whole genomes) that would be far too computationally costly if it were applied to the entire collection of known genomes. By focusing on a set of (customizable) target organisms and their near neighbors, SIANN can operate quickly and with low computational requirements while delivering highly accurate results. SIANN is available on Illumina's BaseSpace (www.basespace.illumina.com) as a NativeApp, with the database tested here (Appendix 1), as well as a database made from the NCBI representative set of prokaryotic genomes (ftp://ftp.ncbi.nlm.nih.gov/genomes/genome_reports/) (Appendix 2) and the complete set of NCBI viral genomes (ftp://ftp.ncbi.nlm.nih.gov/refseq/release/viral/) (Appendix 3).\",\n", - "\n", - " \"cite_spans\": [],\n", - "\n", - " \"ref_spans\": [],\n", - "\n", - " \"section\": \"Discussion\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"text\": \". CC-BY-NC-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprint BaseSpace was chosen as an appropriate release platform because while the entire set of software and dependencies can be deployed by the user from within a graphical user interface, the actual computation takes place in a controlled 'cloud' environment. Such a distribution strategy obviates the need to satisfy the multiple software or OS dependencies that often arises with academic computational methods. Results for SIANN are compiled into a report format, showing both the organisms that surpass 95% confidence, as well as the closest strain match for each species. 
The default view masks the raw data output, so that the results are human-readable and do not present extraneous information. While the code for execution and databaseconstruction on a users system is available from Signature Science, LLC, additional databases on the BaseSpace platform can be made available upon request.\",\n", - "\n", - " \"cite_spans\": [],\n", - "\n", - " \"ref_spans\": [],\n", - "\n", - " \"section\": \"Discussion\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"text\": \"There is a neverending list of questions that one could ask of metagenomic sequencing data generated from important samples. Instead of answering them all, we demonstrate a technique with a very narrow focus that is able to report with a high degree of confidence whether a given set of organisms is present in a sample. These results are presented to the user in a comprehensible format, and accessible on a commonly-used web platform. The world of bioinformatics will continue to progress and develop more sophisticated tools for metagenomic analysis, and we hope that the utility of SIANN will convince others to package and benchmark their tools in a way that they can be used with confidence by the larger public, as well as the research community.\",\n", - "\n", - " \"cite_spans\": [],\n", - "\n", - " \"ref_spans\": [],\n", - "\n", - " \"section\": \"Discussion\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"text\": \". CC-BY-NC-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . 
https://doi.org/10.1101/001727 doi: bioRxiv preprint\",\n", - "\n", - " \"cite_spans\": [],\n", - "\n", - " \"ref_spans\": [],\n", - "\n", - " \"section\": \"Discussion\"\n", - "\n", - " }\n", - "\n", - " ],\n", - "\n", - " \"bib_entries\": {\n", - "\n", - " \"BIBREF0\": {\n", - "\n", - " \"ref_id\": \"b0\",\n", - "\n", - " \"title\": \"Scalable metagenomic taxonomy classification using a reference genome database\",\n", - "\n", - " \"authors\": [\n", - "\n", - " {\n", - "\n", - " \"first\": \"S\",\n", - "\n", - " \"middle\": [\n", - "\n", - " \"K\"\n", - "\n", - " ],\n", - "\n", - " \"last\": \"Ames\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"D\",\n", - "\n", - " \"middle\": [\n", - "\n", - " \"A\"\n", - "\n", - " ],\n", - "\n", - " \"last\": \"Hysom\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"S\",\n", - "\n", - " \"middle\": [\n", - "\n", - " \"N\"\n", - "\n", - " ],\n", - "\n", - " \"last\": \"Gardner\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"G\",\n", - "\n", - " \"middle\": [\n", - "\n", - " \"S\"\n", - "\n", - " ],\n", - "\n", - " \"last\": \"Lloyd\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"M\",\n", - "\n", - " \"middle\": [\n", - "\n", - " \"B\"\n", - "\n", - " ],\n", - "\n", - " \"last\": \"Gokhale\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"Allen\",\n", - "\n", - " \"middle\": [],\n", - "\n", - " \"last\": \"Je\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " }\n", - "\n", - " ],\n", - "\n", - " \"year\": 2013,\n", - "\n", - " \"venue\": \"Bioinformatics\",\n", - "\n", - " \"volume\": \"29\",\n", - "\n", - " \"issn\": \"18\",\n", - "\n", - " \"pages\": \"2253--60\",\n", - "\n", - " \"other_ids\": {\n", - "\n", - " \"DOI\": [\n", - "\n", - " 
\"10.1093/bioinformatics/btt389\"\n", - "\n", - " ]\n", - "\n", - " }\n", - "\n", - " },\n", - "\n", - " \"BIBREF1\": {\n", - "\n", - " \"ref_id\": \"b1\",\n", - "\n", - " \"title\": \"A comparative evaluation of sequence classification programs\",\n", - "\n", - " \"authors\": [\n", - "\n", - " {\n", - "\n", - " \"first\": \"A\",\n", - "\n", - " \"middle\": [\n", - "\n", - " \"L\"\n", - "\n", - " ],\n", - "\n", - " \"last\": \"Bazinet\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"M\",\n", - "\n", - " \"middle\": [\n", - "\n", - " \"P\"\n", - "\n", - " ],\n", - "\n", - " \"last\": \"Cummings\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " }\n", - "\n", - " ],\n", - "\n", - " \"year\": 2012,\n", - "\n", - " \"venue\": \"BMC Bioinformatics\",\n", - "\n", - " \"volume\": \"13\",\n", - "\n", - " \"issn\": \"\",\n", - "\n", - " \"pages\": \"\",\n", - "\n", - " \"other_ids\": {\n", - "\n", - " \"DOI\": [\n", - "\n", - " \"10.1186/1471-2105-13-92\"\n", - "\n", - " ]\n", - "\n", - " }\n", - "\n", - " },\n", - "\n", - " \"BIBREF2\": {\n", - "\n", - " \"ref_id\": \"b2\",\n", - "\n", - " \"title\": \"Rapid phylogenetic and functional classification of short genomic fragments with signature peptides\",\n", - "\n", - " \"authors\": [\n", - "\n", - " {\n", - "\n", - " \"first\": \"J\",\n", - "\n", - " \"middle\": [],\n", - "\n", - " \"last\": \"Berendzen\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"W\",\n", - "\n", - " \"middle\": [\n", - "\n", - " \"J\"\n", - "\n", - " ],\n", - "\n", - " \"last\": \"Bruno\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"J\",\n", - "\n", - " \"middle\": [\n", - "\n", - " \"D\"\n", - "\n", - " ],\n", - "\n", - " \"last\": \"Cohn\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"N\",\n", - "\n", - " \"middle\": [\n", - "\n", - " 
\"W\"\n", - "\n", - " ],\n", - "\n", - " \"last\": \"Hengartner\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"C\",\n", - "\n", - " \"middle\": [\n", - "\n", - " \"R\"\n", - "\n", - " ],\n", - "\n", - " \"last\": \"Kuske\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"B\",\n", - "\n", - " \"middle\": [\n", - "\n", - " \"H\"\n", - "\n", - " ],\n", - "\n", - " \"last\": \"Mcmahon\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"M\",\n", - "\n", - " \"middle\": [\n", - "\n", - " \"A\"\n", - "\n", - " ],\n", - "\n", - " \"last\": \"Wolinsky\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"G\",\n", - "\n", - " \"middle\": [],\n", - "\n", - " \"last\": \"Xie\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " }\n", - "\n", - " ],\n", - "\n", - " \"year\": 2012,\n", - "\n", - " \"venue\": \"BMC Research Notes\",\n", - "\n", - " \"volume\": \"5\",\n", - "\n", - " \"issn\": \"\",\n", - "\n", - " \"pages\": \"\",\n", - "\n", - " \"other_ids\": {\n", - "\n", - " \"DOI\": [\n", - "\n", - " \"10.1186/1756-0500-5-460\"\n", - "\n", - " ]\n", - "\n", - " }\n", - "\n", - " },\n", - "\n", - " \"BIBREF3\": {\n", - "\n", - " \"ref_id\": \"b3\",\n", - "\n", - " \"title\": \"Phymm and PhymmBL: metagenomic phylogenetic classification with interpolated Markov models\",\n", - "\n", - " \"authors\": [\n", - "\n", - " {\n", - "\n", - " \"first\": \"A\",\n", - "\n", - " \"middle\": [],\n", - "\n", - " \"last\": \"Brady\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"S\",\n", - "\n", - " \"middle\": [\n", - "\n", - " \"L\"\n", - "\n", - " ],\n", - "\n", - " \"last\": \"Salzberg\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " }\n", - "\n", - " ],\n", - "\n", - " \"year\": 2009,\n", - "\n", - " \"venue\": \"Nature 
Methods\",\n", - "\n", - " \"volume\": \"6\",\n", - "\n", - " \"issn\": \"9\",\n", - "\n", - " \"pages\": \"673--679\",\n", - "\n", - " \"other_ids\": {\n", - "\n", - " \"DOI\": [\n", - "\n", - " \"10.1038/nmeth.1358\"\n", - "\n", - " ]\n", - "\n", - " }\n", - "\n", - " },\n", - "\n", - " \"BIBREF4\": {\n", - "\n", - " \"ref_id\": \"b4\",\n", - "\n", - " \"title\": \"Integrative analysis of environmental sequences using MEGAN4\",\n", - "\n", - " \"authors\": [\n", - "\n", - " {\n", - "\n", - " \"first\": \"D\",\n", - "\n", - " \"middle\": [\n", - "\n", - " \"H\"\n", - "\n", - " ],\n", - "\n", - " \"last\": \"Huson\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"S\",\n", - "\n", - " \"middle\": [],\n", - "\n", - " \"last\": \"Mitra\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"N\",\n", - "\n", - " \"middle\": [],\n", - "\n", - " \"last\": \"Weber\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"H\",\n", - "\n", - " \"middle\": [],\n", - "\n", - " \"last\": \"Ruscheweyh\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"S\",\n", - "\n", - " \"middle\": [\n", - "\n", - " \"C\"\n", - "\n", - " ],\n", - "\n", - " \"last\": \"Schuster\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " }\n", - "\n", - " ],\n", - "\n", - " \"year\": 2011,\n", - "\n", - " \"venue\": \"Genome Research\",\n", - "\n", - " \"volume\": \"21\",\n", - "\n", - " \"issn\": \"\",\n", - "\n", - " \"pages\": \"1552--1560\",\n", - "\n", - " \"other_ids\": {}\n", - "\n", - " },\n", - "\n", - " \"BIBREF5\": {\n", - "\n", - " \"ref_id\": \"b5\",\n", - "\n", - " \"title\": \"PEMer: a computational framework with simulation-based error models for inferring genomic structural variants from massive paired-end sequencing data\",\n", - "\n", - " \"authors\": [\n", - "\n", - " {\n", - "\n", - " 
\"first\": \"J\",\n", - "\n", - " \"middle\": [\n", - "\n", - " \"O\"\n", - "\n", - " ],\n", - "\n", - " \"last\": \"Korbel\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"A\",\n", - "\n", - " \"middle\": [],\n", - "\n", - " \"last\": \"Abyzov\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"X\",\n", - "\n", - " \"middle\": [\n", - "\n", - " \"J\"\n", - "\n", - " ],\n", - "\n", - " \"last\": \"Mu\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"N\",\n", - "\n", - " \"middle\": [],\n", - "\n", - " \"last\": \"Carriero\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"P\",\n", - "\n", - " \"middle\": [],\n", - "\n", - " \"last\": \"Cayting\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"Z\",\n", - "\n", - " \"middle\": [],\n", - "\n", - " \"last\": \"Zhang\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"M\",\n", - "\n", - " \"middle\": [],\n", - "\n", - " \"last\": \"Snyder\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"M\",\n", - "\n", - " \"middle\": [\n", - "\n", - " \"B\"\n", - "\n", - " ],\n", - "\n", - " \"last\": \"Gerstein\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " }\n", - "\n", - " ],\n", - "\n", - " \"year\": 2009,\n", - "\n", - " \"venue\": \"\",\n", - "\n", - " \"volume\": \"\",\n", - "\n", - " \"issn\": \"\",\n", - "\n", - " \"pages\": \"\",\n", - "\n", - " \"other_ids\": {}\n", - "\n", - " },\n", - "\n", - " \"BIBREF7\": {\n", - "\n", - " \"ref_id\": \"b7\",\n", - "\n", - " \"title\": \"Fast gapped-read alignment with Bowtie 2\",\n", - "\n", - " \"authors\": [\n", - "\n", - " {\n", - "\n", - " \"first\": \"B\",\n", - "\n", - " \"middle\": [],\n", - "\n", - " \"last\": 
\"Langmead\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"S\",\n", - "\n", - " \"middle\": [],\n", - "\n", - " \"last\": \"Salzberg\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " }\n", - "\n", - " ],\n", - "\n", - " \"year\": 2012,\n", - "\n", - " \"venue\": \"Nature Methods\",\n", - "\n", - " \"volume\": \"9\",\n", - "\n", - " \"issn\": \"\",\n", - "\n", - " \"pages\": \"357--359\",\n", - "\n", - " \"other_ids\": {}\n", - "\n", - " },\n", - "\n", - " \"BIBREF8\": {\n", - "\n", - " \"ref_id\": \"b8\",\n", - "\n", - " \"title\": \"Accurate and fast estimation of taxonomic profiles from metagenomic shotgun sequences\",\n", - "\n", - " \"authors\": [\n", - "\n", - " {\n", - "\n", - " \"first\": \"B\",\n", - "\n", - " \"middle\": [],\n", - "\n", - " \"last\": \"Liu\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"T\",\n", - "\n", - " \"middle\": [],\n", - "\n", - " \"last\": \"Gibbons\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"M\",\n", - "\n", - " \"middle\": [],\n", - "\n", - " \"last\": \"Ghodsi\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"T\",\n", - "\n", - " \"middle\": [],\n", - "\n", - " \"last\": \"Treangen\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"M\",\n", - "\n", - " \"middle\": [],\n", - "\n", - " \"last\": \"Pop\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " }\n", - "\n", - " ],\n", - "\n", - " \"year\": 2011,\n", - "\n", - " \"venue\": \"BMC Genomics\",\n", - "\n", - " \"volume\": \"12\",\n", - "\n", - " \"issn\": \"2\",\n", - "\n", - " \"pages\": \"\",\n", - "\n", - " \"other_ids\": {}\n", - "\n", - " },\n", - "\n", - " \"BIBREF9\": {\n", - "\n", - " \"ref_id\": \"b9\",\n", - "\n", - " \"title\": \"MetaSim: a sequencing simulator for genomics and metagenomics\",\n", - 
"\n", - " \"authors\": [\n", - "\n", - " {\n", - "\n", - " \"first\": \"D\",\n", - "\n", - " \"middle\": [\n", - "\n", - " \"C\"\n", - "\n", - " ],\n", - "\n", - " \"last\": \"Richter\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"F\",\n", - "\n", - " \"middle\": [],\n", - "\n", - " \"last\": \"Ott\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"A\",\n", - "\n", - " \"middle\": [\n", - "\n", - " \"F\"\n", - "\n", - " ],\n", - "\n", - " \"last\": \"Auch\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"R\",\n", - "\n", - " \"middle\": [],\n", - "\n", - " \"last\": \"Schmid\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"D\",\n", - "\n", - " \"middle\": [\n", - "\n", - " \"H\"\n", - "\n", - " ],\n", - "\n", - " \"last\": \"Huson\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " }\n", - "\n", - " ],\n", - "\n", - " \"year\": 2008,\n", - "\n", - " \"venue\": \"PLoS One\",\n", - "\n", - " \"volume\": \"3\",\n", - "\n", - " \"issn\": \"10\",\n", - "\n", - " \"pages\": \"\",\n", - "\n", - " \"other_ids\": {\n", - "\n", - " \"DOI\": [\n", - "\n", - " \"10.1371/journal.pone.0003373\"\n", - "\n", - " ]\n", - "\n", - " }\n", - "\n", - " },\n", - "\n", - " \"BIBREF10\": {\n", - "\n", - " \"ref_id\": \"b10\",\n", - "\n", - " \"title\": \"Genomic variation landscape of the human gut microbiome\",\n", - "\n", - " \"authors\": [\n", - "\n", - " {\n", - "\n", - " \"first\": \"S\",\n", - "\n", - " \"middle\": [],\n", - "\n", - " \"last\": \"Schloissnig\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"M\",\n", - "\n", - " \"middle\": [],\n", - "\n", - " \"last\": \"Arumugam\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"S\",\n", - "\n", - " \"middle\": [],\n", - 
"\n", - " \"last\": \"Sunagawa\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"M\",\n", - "\n", - " \"middle\": [],\n", - "\n", - " \"last\": \"Mitreva\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"J\",\n", - "\n", - " \"middle\": [],\n", - "\n", - " \"last\": \"Tap\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"A\",\n", - "\n", - " \"middle\": [],\n", - "\n", - " \"last\": \"Zhu\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"A\",\n", - "\n", - " \"middle\": [],\n", - "\n", - " \"last\": \"Waller\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"D\",\n", - "\n", - " \"middle\": [\n", - "\n", - " \"R\"\n", - "\n", - " ],\n", - "\n", - " \"last\": \"Mende\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"J\",\n", - "\n", - " \"middle\": [\n", - "\n", - " \"R\"\n", - "\n", - " ],\n", - "\n", - " \"last\": \"Kultima\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"J\",\n", - "\n", - " \"middle\": [],\n", - "\n", - " \"last\": \"Martin\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"K\",\n", - "\n", - " \"middle\": [],\n", - "\n", - " \"last\": \"Kota\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"S\",\n", - "\n", - " \"middle\": [\n", - "\n", - " \"R\"\n", - "\n", - " ],\n", - "\n", - " \"last\": \"Sunyaev\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " {\n", - "\n", - " \"first\": \"G\",\n", - "\n", - " \"middle\": [\n", - "\n", - " \"M\"\n", - "\n", - " ],\n", - "\n", - " \"last\": \"Weinstock\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " },\n", - "\n", - " 
{\n", - "\n", - " \"first\": \"P\",\n", - "\n", - " \"middle\": [],\n", - "\n", - " \"last\": \"Bork\",\n", - "\n", - " \"suffix\": \"\"\n", - "\n", - " }\n", - "\n", - " ],\n", - "\n", - " \"year\": 2013,\n", - "\n", - " \"venue\": \"Nature\",\n", - "\n", - " \"volume\": \"493\",\n", - "\n", - " \"issn\": \"7430\",\n", - "\n", - " \"pages\": \"45--50\",\n", - "\n", - " \"other_ids\": {\n", - "\n", - " \"DOI\": [\n", - "\n", - " \"10.1038/nature11711\"\n", - "\n", - " ]\n", - "\n", - " }\n", - "\n", - " }\n", - "\n", - " },\n", - "\n", - " \"ref_entries\": {\n", - "\n", - " \"FIGREF0\": {\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " \"text\": \"Relationship of reported value for each program (horizontal axis, log scale) to the empirically-determined Positive Predictive Value (PPV), shown on the vertical axis. While the exact values depend on the test data used, the general values at significant cutoff values (0.8, 0.9, 0.95 PPV) remain relatively constant across different datasets (data not shown).\",\n", - "\n", - " \"latex\": null,\n", - "\n", - " \"type\": \"figure\"\n", - "\n", - " },\n", - "\n", - " \"FIGREF1\": {\n", - "\n", - " \"text\": \"The Positive Predictive Value (PPV, vertical axis) is shown for each organism (boxes on right), at each level of known abundance (horizontal axis, see\",\n", - "\n", - " \"latex\": null,\n", - "\n", - " \"type\": \"figure\"\n", - "\n", - " },\n", - "\n", - " \"TABREF0\": {\n", - "\n", - " \"text\": \". Summary of methods for metagenomic classification.\",\n", - "\n", - " \"latex\": null,\n", - "\n", - " \"type\": \"table\"\n", - "\n", - " },\n", - "\n", - " \"TABREF1\": {\n", - "\n", - " \"text\": \"Table 2. The abundance of each target organism in each set of simulated datasets. 
Each set is indicated by the number in the top row, and was generated with 50 replicates.\",\n", - "\n", - " \"latex\": null,\n", - "\n", - " \"type\": \"table\"\n", - "\n", - " },\n", - "\n", - " \"TABREF2\": {\n", - "\n", - " \"text\": \"Segata N, Waldron L, Ballarini A, Narasimhan V,Jousson O, and Huttenhower C (2012). Metagenomic microbial community profiling using unique clade-specific marker genes. NatureMethods, 9(8):811-4. doi: 10.1038/nmeth.2066. Sunagawa S, Mende DR, Zeller G, Izquierdo-Carrasco F, Berger SA, Kultima JR, Coelho LP, Arumugam M, Tap J, Nielsen HB, Rasmussen S, Brunak S, Pedersen O, Guarner F, de Vos WM, Wang J, Li J, Dor\\u00e9 J, Ehrlich SD, Stamatakis A & Bork P (2013). Metagenomic species profiling using universal phylogenetic marker genes. Nature Methods, 10, 1196-1199 (2013) doi:10.1038/nmeth.2693 Wood DE and Salzberg SL. Ultrafast metagenomic sequence classification using exact alignments. In submission. Wu GD, Chen J, Hoffmann C, Bittinger K, Chen YY, Keilbaugh SA, Bewtra M, Knights D, Walters WA, Knight R, Sinha R, Gilroy E, Gupta K, Baldassano R, Nessel L, Li H, Bushman FD, Lewis JD (2011). Linking long-term dietary patterns with gut microbial enterotypes. Science. 334(6052):105-8. doi: 10.1126/science.1208344. The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprint Appendix 2: Viral Database Abaca bunchy top virus DNA-C Abaca bunchy top virus DNA-M Abaca bunchy top virus DNA-The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . 
https://doi.org/10.1101Circovirus-like genome BBC-A Circovirus-like genome CB-A Circovirus-like genome CB-B Circovirus-like genome RW-A Circovirus-like genome RW-B Circovirus-like genome RW-C Circovirus-like genome RW-D Circovirus-like genome RW-E Circovirus-like genome SAR-A Circovirus-like genome SAR-B Citrus psorosis virus RNA1 Citrus psorosis virus RNA2 Citrus psorosis virus RNA3The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprintThe copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprintThe copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprint . CC-BY-NC-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprint . CC-BY-NC-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder.It . https://doi.org/10.1101/001727 doi: bioRxiv preprint Thermovirga lienii DSM 17291 . CC-BY-NC-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprint Thermus scotoductus SA 01 Thermus sp CCB US3 UF1 Thermus thermophilus HB27 Vibrio cholerae O1 biovar El Tor str N16961 Vibrio fischeri ES114 Vibrio furnissii NCTC 11218 Vibrio harveyi ATCC BAA 1116 . 
CC-BY-NC-ND 4.0 International license is made available under a\",\n", - "\n", - " \"latex\": null,\n", - "\n", - " \"type\": \"table\"\n", - "\n", - " }\n", - "\n", - " },\n", - "\n", - " \"back_matter\": []\n", - "\n", - "}\n" - ] - } - ], + "cell_type": "markdown", + "metadata": {}, "source": [ - "f = open('f056da9c64fbf00a4645ae326e8a4339d015d155.json')\n", - "for line in f:\n", - " print(line)" + "# Data Cleaning" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## How to load and read a json file" + "Now that we have explored the structure of the dataset, let us focus on our areas of interest as mentioned earlier: namely the Title, Abstract and the body of the text.\\\n", + "We know that we can access these for a given paper by scanning the metadata for a paper of choice for its title and abstract, and if we need it, using its `sha` column value to get the full text of the paper." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First let us clean the metadata by dropping all NA valued rows for the title, sha and abstract columns.\n", + "\n", + "[Pandas filter rows](https://stackoverflow.com/questions/17071871/how-to-select-rows-from-a-dataframe-based-on-column-values)" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": {}, + "outputs": [], + "source": [ + "bio = meta.loc[meta['source_x'] == 'biorxiv']" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 111, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'paper_id': 'f056da9c64fbf00a4645ae326e8a4339d015d155', 'metadata': {'title': 'SIANN: Strain Identification by Alignment to Near Neighbors', 'authors': [{'first': 'Samuel', 'middle': ['S'], 'last': 'Minot', 'suffix': '', 'affiliation': {}, 'email': ''}, {'first': 'Stephen', 'middle': ['D'], 'last': 'Turner', 'suffix': '', 'affiliation': {}, 'email': ''}, {'first': 'Krista', 'middle': ['L'], 'last': 'Ternus', 'suffix': '', 
'affiliation': {}, 'email': ''}, {'first': 'Dana', 'middle': ['R'], 'last': 'Kadavy', 'suffix': '', 'affiliation': {}, 'email': ''}]}, 'abstract': [{'text': 'Next-generation sequencing is increasingly being used to study samples composed of mixtures of organisms, such as in clinical applications where the presence of a pathogen at very low abundance may be highly important. We present an analytical method (SIANN: Strain Identification by Alignment to Near Neighbors) specifically designed to rapidly detect a set of target organisms in mixed samples that achieves a high degree of species-and strain-specificity by aligning short sequence reads to the genomes of near neighbor organisms, as well as that of the target. Empirical benchmarking alongside the current state-of-the-art methods shows an extremely high Positive Predictive Value, even at very low abundances of the target organism in a mixed sample. SIANN is available as an Illumina BaseSpace app, as well as through Signature Science, LLC. SIANN results are presented in a streamlined report designed to be comprehensible to the non-specialist user, providing a powerful tool for rapid species detection in a mixed sample. By focusing on a set of (customizable) target organisms and their near neighbors, SIANN can operate quickly and with low computational requirements while delivering highly accurate results.', 'cite_spans': [], 'ref_spans': [], 'section': 'Abstract'}], 'body_text': [{'text': 'There are many different methods that characterize the mixture of organisms present within a metagenomic dataset. Such datasets are generated when a complex environmental sample is processed by a \"next-generation\" high-throughput genome sequencing protocol, and they consist of large numbers of short nucleotide sequences. Each sequence represents a small fragment of a randomly selected genome from the very large collection of genomes present in the source sample. 
Those sequences indicate the presence of one organism or another according to their similarity to a set of known reference genomes. While a given sequence may be unique to one species, it also may be found in diverse organisms across the tree of life. Therefore, one analytical challenge (among many) is to take that collection of sequences (likely numbering in the millions) and accurately determine what species are present in the sample. Here we describe a novel method (SIANN: Strain Identification by Alignment to Near Neighbors) that is specifically designed to rapidly detect a set of targeted organisms from a metagenomic dataset by aligning reads to genomic regions that are unique at the strain or species level.', 'cite_spans': [], 'ref_spans': [], 'section': 'Introduction'}, {'text': \"The analytical question motivating a particular piece of metagenomic bioinformatic analysis may vary widely by user and sample type (Segata, et al., 2013) . For example, the function of the human gut microbiome may depend on the relative abundance of hundreds of species of bacteria and the types of metabolic genes they contain (Wu, et al., 2011; Schloissnig, et al., 2013) . In contrast, the clinical treatment of a patient may depend on whether or not a particular virus, or a consortium of co-infecting pathogens, is/are detected in their blood. It is this second class of presence/absence questions that SIANN is designed to address. 
SIANN is appropriate for situations in which a user wants to know whether a particular organism or set of organisms is present in a sample, but isn't interested in the functions encoded in their genomes, the relative abundance of each organism, or any other more in-depth analysis.\", 'cite_spans': [{'start': 132, 'end': 154, 'text': '(Segata, et al., 2013)', 'ref_id': None}, {'start': 329, 'end': 347, 'text': '(Wu, et al., 2011;', 'ref_id': None}, {'start': 348, 'end': 374, 'text': 'Schloissnig, et al., 2013)', 'ref_id': 'BIBREF10'}], 'ref_spans': [], 'section': 'Introduction'}, {'text': 'Metagenomic classification methods are based on a wide variety of theoretical underpinnings. The basic varieties include alignment of reads to various nucleotide databases or exact matching to nucleotide or protein signature sequences (or kmers). A representative set of recent methods are described in Table 1 (also see Bazinet & Cummings 2012) .', 'cite_spans': [{'start': 321, 'end': 345, 'text': 'Bazinet & Cummings 2012)', 'ref_id': 'BIBREF1'}], 'ref_spans': [{'start': 303, 'end': 310, 'text': 'Table 1', 'ref_id': None}], 'section': 'Approach'}, {'text': 'Alignment to large nucleotide database Huson, et al., 2011 PhymmBL Alignment to large nucleotide database with interpolated Markov models Brady & Salzberg, 2011 Metaphyler Alignment to clade-specific marker genes Liu, et al., Overall, these methods are designed to either classify individual reads to, and/or predict the total abundance of, clades (e.g. genus or species) across the entire tree of life. They generally require reference databases that are very large and/or require a large amount of processing to generate. The gap SIANN is designed to fill is when the entire tree of life is irrelevant, and only predefined subsets of organisms need to be detected. 
For an underlying method we chose read alignment to diagnostic genomic regions because the algorithms for read alignment are highly parallelizable and have been optimized heavily by the community at large (the current implementation of SIANN uses bowtie2 [Langmead & Salzberg, 2012] for the alignment function, but can be adapted to any alignment algorithm). This approach is distinct from using cladespecific marker genes (Segata, et al., 2012) because unique regions that are larger, smaller, or outside of genes can also be used. Furthermore, this approach supports the rapid construction of custom databases using reference genome sets that require only minimal user-supplied structure.', 'cite_spans': [{'start': 39, 'end': 66, 'text': 'Huson, et al., 2011 PhymmBL', 'ref_id': None}, {'start': 138, 'end': 160, 'text': 'Brady & Salzberg, 2011', 'ref_id': None}, {'start': 213, 'end': 225, 'text': 'Liu, et al.,', 'ref_id': None}, {'start': 922, 'end': 949, 'text': '[Langmead & Salzberg, 2012]', 'ref_id': 'BIBREF7'}, {'start': 1090, 'end': 1112, 'text': '(Segata, et al., 2012)', 'ref_id': None}], 'ref_spans': [], 'section': 'Name Method Reference MEGAN'}, {'text': 'To understand the principle at work, consider a set of reads that have been aligned to the genomes of several strains belonging to two species. Some regions of those genomes are species-specific, some are strain-specific, and some are shared ( Figure 1a ). When a set of reads is aligned to those genomes such that each read is placed in as many locations as it has a match (at a reasonably stringent threshold), visual inspection of the distribution of reads yields an intuitive understanding of the true source organism as Species I/Strain B (Figure 1b ). 
If Strain B were not present in the reference database, it would still be clear that the organism was an unknown strain of Species I.', 'cite_spans': [], 'ref_spans': [{'start': 244, 'end': 254, 'text': 'Figure 1a', 'ref_id': None}, {'start': 545, 'end': 555, 'text': '(Figure 1b', 'ref_id': None}], 'section': 'Name Method Reference MEGAN'}, {'text': 'The unique identification of a species or strain is quantified by the proportion of the genome that is determined to be species-or strain-specific (defined as reads that are aligned to regions that are species-or strain-specific). Each species and strain is then assigned a numerical measure of the proportion that is covered by these diagnostic reads, and that proportional measure is compared to the ideal case, where sequences from a single organism (generated in silico) are aligned against the database in an identical manner. After that normalization factor is applied, the resulting score indicates whether the source sample contained any of the organisms in the reference database. Figure 1 . A) For a group of strains belonging to two different species, some regions may be unique to each species (region 1), while other regions may be unique to strains within each species (regions 2 and 3). B) A set of reads are aligned to these genomes, and the ones that align in a species-or strain-specific manner are identified by the combination of genomes to which they align. In this example, Strain B of Species I is the organism identified.', 'cite_spans': [], 'ref_spans': [{'start': 690, 'end': 698, 'text': 'Figure 1', 'ref_id': None}], 'section': 'Name Method Reference MEGAN'}, {'text': 'The analysis is conducted independently on both the species and the strain level, so that if the true strain is not present in the database, the species of origin will still be identified. 
While many methods consider the complete taxonomic tree and assign reads to the least common ancestor, SIANN considers only two taxonomic levels: species and strain, throwing out anything that is not unique at one of those levels and thus obviating many of the confounding factors introduced by manually curated taxonomies.', 'cite_spans': [], 'ref_spans': [], 'section': 'Name Method Reference MEGAN'}, {'text': 'The example shown in Figure 1b indicates that species-specific reads are identified as reads that align to one species (Species I, in that case) but not the other. If Species II were not present in the example shown in Figure 1b , a much larger number of reads would be assigned as \"species-specific,\" when in fact those regions are shared with other species. Therefore, the ability of this method to identify strain-and species-specific sequences is a direct function of the inclusion of near neighbors in the reference database. This characteristic is shared among many classification algorithms, but it is of particular note for this method when users have an opportunity to construct their own database.. In order to detect a target species with a high degree of specificity (reducing false positives), it is necessary to include other related species in the reference database. Only by parallel alignment to those near neighbors can the redundant sequences be separated from the species-specific ones. For example, in order to detect Bacillus anthracis in a sample, it would be necessary to include other species of Bacilli in the reference database so that the presence of B. cereus or B. thuringiensis in a sample does not lead to a false call for B. 
anthracis.', 'cite_spans': [], 'ref_spans': [{'start': 21, 'end': 30, 'text': 'Figure 1b', 'ref_id': None}, {'start': 219, 'end': 228, 'text': 'Figure 1b', 'ref_id': None}], 'section': 'Name Method Reference MEGAN'}, {'text': 'The nomenclature of genus, species, and strain is potentially problematic because it does not correspond to a consistent degree of evolutionary distance or genomic distinctiveness. The ability to distinguish two organisms by any method using genomic sequence data is proportional to the amount of each genome that is shared or unique. One might assume that any two organisms of the same species will have a relatively predictable amount of shared genomic identity. However, some pairs of organisms from the same species may have less in common than other pairs of organisms from different species or even genera. This ambiguity impacts SIANN in two ways. If two organisms have very little genomic sequence to distinguish them, the sensitivity of SIANN to detect either one will diminish (the rate of false negatives will increase as the likelihood of sequencing unique regions decreases). Conversely, if an organism is extremely dissimilar to the near neighbors selected for the database, the specificity with which SIANN detects that organism will decline (the rate of false positives will increase as the number of related genomes available in the database decreases). For example, if a database contained only E. coli and B.anthracis, a sample containing B. cereus would be misidentified as contraining B. anthracis. In the intended use case, a database targeting B. anthracis would contain B. cereus and a number of other near neighbors to prevent that kind of misidentification. 
It would be convenient to say that an ideal database can be made by calculating the ideal genetic distance between all references and then finding an ideal set of organisms to make up that database, but the behavior of any database will be governed by the particular genomes of the organisms it encounters in the wild. Because not all organisms evolve in the same manner (differences in mutation rate, horizontal gene transfer, recombination, etc), the suitability of a database and method to detect a given organism can only be determined by thorough validation and benchmarking, as well as updating the reference database as needed. Users of SIANN may construct their own custom databases to include newly identified genomes or specific subsets of genomes that best suit their research interests.', 'cite_spans': [], 'ref_spans': [], 'section': 'Name Method Reference MEGAN'}, {'text': 'Steps to construct a custom database:', 'cite_spans': [], 'ref_spans': [], 'section': 'Name Method Reference MEGAN'}, {'text': '1. Select a set of target organisms 2. Gather a set of genome sequences for those target organisms as well as a matched set of near neighbors 3. Using those reference genome sequences as an input, SIANN will: a. Construct a reference index for alignment b. Simulate a set of reads from each genome c. Align each of those simulated read sets to all of the reference genomes d. Calculate the proportion of each reference genome that is strain-or species-specific e. [If two organisms do not have a minimal amount of unique sequences that exceeds the rate of sequencing error, SIANN asks that all but one of those organisms are removed from the database to eliminate redundancy. 
Note that the user can provide a single representative genome with multiple strain names so that the redundant strain names are not lost.]', 'cite_spans': [], 'ref_spans': [], 'section': 'Name Method Reference MEGAN'}, {'text': 'The files contained within each SIANN database are a compressed genomic index and a list containing the proportion of each reference genome that was found to be strainor species-specific during database construction.', 'cite_spans': [], 'ref_spans': [], 'section': 'Name Method Reference MEGAN'}, {'text': 'To run SIANN:', 'cite_spans': [], 'ref_spans': [], 'section': 'Name Method Reference MEGAN'}, {'text': '1. Select a pre-made SIANN database and a set of sequences to be analyzed, and 2. SIANN will: a. Align each of the reads against the reference genomes b. Calculate the proportion of each reference genome that is strain-or species-specific within those reads c. Compare that proportion to the simulated ideal case generated during database creation . CC-BY-NC-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprint d. Calculate the probability that the given results could be generated by random chance e. Report the normalized proportion and non-parametric statistic of likelihood for each strain and species in the reference database. The normalized proportion of the genome covered by strain-or speciesspecific reads is the primary statistic reported by this tool.', 'cite_spans': [], 'ref_spans': [], 'section': 'Name Method Reference MEGAN'}, {'text': 'The performance of SIANN (version 1.6) was tested in comparison to the following stateof-the-art metagenomic classification programs: LMAT (version 1.2), MetaPhlAn (version 1.7.7), and Kraken (version 0.9.1b). 
All of the programs in Table 1 were investigated for this effort, and three were chosen based on their ability to run on our high-performance computing cluster with an execution time and memory requirement that would be suitable to a clinical lab. Each program was run on a set of 600 simulated datasets generated by MetaSim (Richter, et al., 2008) . Each dataset consisted of 15,000,000 reads (100bp single-ended) with Illumina-simulated error (fourth-degree polynomial) (Korbel, et al., 2009) . The 600 datasets were broken into 12 sets of 50 replicates. Each of the 12 sets contained organisms at different levels of abundance as shown in Table 2 .', 'cite_spans': [{'start': 535, 'end': 558, 'text': '(Richter, et al., 2008)', 'ref_id': 'BIBREF9'}, {'start': 682, 'end': 704, 'text': '(Korbel, et al., 2009)', 'ref_id': 'BIBREF5'}], 'ref_spans': [{'start': 233, 'end': 240, 'text': 'Table 1', 'ref_id': None}, {'start': 852, 'end': 859, 'text': 'Table 2', 'ref_id': None}], 'section': 'Benchmarking'}, {'text': 'Organisms were specifically chosen in pairs so that the ability to distinguish these near neighbors could be determined. The abundances were staggered at 4-fold intervals so that a wide range could be evaluated. All known species of near neighbors for each of the 12 target organisms were included in the reference database used by SIANN for this benchmarking (\"Target Pathogen Database\") and are shown in Appendix 1.', 'cite_spans': [], 'ref_spans': [], 'section': 'Benchmarking'}, {'text': 'Each program outputs a distinct measure. Kraken and LMAT both count the reads assigned to each taxon, MetaPhlAn calculates the abundance, and SIANN outputs a measure of the proportion of diagnostic genomic regions present. To put these measures on an even footing, we empirically calculated the false positive rate for each method over all 600 samples, at each possible measure of output. Because each dataset is made up of known organisms, any result can be classified as true or false. 
Therefore, for any possible result (say, 513 reads classified by LMAT or 1.6% abundance assigned by MetaPhlAn), one can calculate the proportion of calls with at least the same amount of support that were correct (True Positives/[True Positives+False Positives]), over all of the 600 datasets. That measure is commonly given as Positive Predictive Value (PPV). For each program, the results can be translated from the raw value into a PPV that is based on this empirical measure of error. The key item of interest is the PPV value for the results that we know to be true positives, the defined spike organisms. Another way of describing this approach is to say that the results of each program have been normalized to the false positive error rate that was empirically observed. If another set of samples were generated, the PPV vs. raw value curve ( Figure 2 ) would likely fall differently, but in this case it gives us a means of comparing a diverse set of methods against the same ground truth. If method 1 detects an organism with a higher PPV than method 2 does, it means that method 1 has fewer false positives in the range that it reports true positives, which is the definition of utility in this setting.', 'cite_spans': [], 'ref_spans': [{'start': 1339, 'end': 1347, 'text': 'Figure 2', 'ref_id': 'FIGREF0'}], 'section': 'Benchmarking'}, {'text': 'For each method, PPV was calculated as a function of raw output value. Briefly, this was done by compiling the output for all 600 samples, labeling each result as false or true based on the sample set that it came from, and then calculating (at each possible value of output) what the proportion of TP/[TP+FP] was for results with at least that level of raw output. Some simplification steps were taken, such as focusing on the specieslevel assignments (for comparison with methods that do not perform strain assignment), and only taking the top hit for each species from each dataset. 
Custom R and BASH scripts were used for the data compilation and analysis.', 'cite_spans': [], 'ref_spans': [], 'section': 'Benchmarking'}, {'text': 'The relationship of raw output value to PPV is shown for each of the four methods in Figure 2 . The point at which PPV is very close to 1 (where 95% of results are true positives) is ~41,000 reads for Kraken, ~2,800 reads for LMAT, ~38% abundance for MetaPhlAn, and 0.21 for SIANN. For SIANN this means that having 38% of the species-unique genome covered by reads resulted in the vast majority of calls being accurate. . CC-BY-NC-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprint For readassignment methods (such as LMAT and Kraken), manual inspection of the results may yield a different understanding of confidence than is presented here, or in any automated analysis.', 'cite_spans': [], 'ref_spans': [{'start': 85, 'end': 93, 'text': 'Figure 2', 'ref_id': 'FIGREF0'}], 'section': 'Results'}, {'text': 'For example, while each read that is assigned by LMAT and Kraken fall above a certain cutoff for speciesspecificity, some individual reads may be much more specific than others. One could identify a read that aligns to a single species of bacteria with 100% accuracy over its 300bp length, with the next closest match being only 90% similar. It is extremely unlikely that a 300bp exact match would arise due to random chance, and so the user could say with confidence that the organism of interest is found within the sequence data (not considering contamination, horizontal gene transfer, etc). However, such an approach is not currently implemented in an automated method, and many of the steps needed to make that assertion are performed manually by a domain expert, including alignment to near neighbors and ensuring that the read does not fall within a transposon, plasmid, etc. 
Therefore, while one could say that a single read is all that is needed to state with high PPV that an organism is present, the amount of reads assigned in an automated manner needed to achieve that level of PPV will number in the thousands (Fig 2) . Table 2 ), and each program (boxes at top), across a maximum of 50 replicates (indicated by the size of each point). Note that the reference database for MetaPhlAn does not include viruses, and the reference database for Kraken does not include RNA viruses (e.g., Hanta virus).', 'cite_spans': [], 'ref_spans': [{'start': 1125, 'end': 1132, 'text': '(Fig 2)', 'ref_id': 'FIGREF0'}, {'start': 1135, 'end': 1142, 'text': 'Table 2', 'ref_id': None}], 'section': 'Results'}, {'text': '. CC-BY-NC-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprint The next phase of benchmarking was to determine how many raw input reads were needed to achieve the threshold for high PPV. To demonstrate this we plotted the known abundance of each spike organism against the PPV value generated by each method (Figure 3 ). Each point (an organism at a known level of abundance) is comprised of a maximum of 50 replicates, where the diameter of each point increases with an increasing number of replicates. For demonstration purposes we are showing two pairs of bacteria and three viruses. Recall that for each of the pairs of bacteria (and the two poxviruses) any sample containing one did not contain the other (as shown in Table 1 ). The empty boxes result from the organisms not being called at any abundance. For MetaPhlAn, that is a result of no viruses being included in the version of the reference database available for this analysis. Kraken assigned no reads to Hanta virus because viral RNA genomes were not included in this version of the reference database (personal communication with D. Wood). 
This emphasizes the point that a) the ability to create custom databases targeting organisms of interest can be valuable, and b) the performance of any method must be benchmarked against each potential target of interest.', 'cite_spans': [], 'ref_spans': [{'start': 460, 'end': 469, 'text': '(Figure 3', 'ref_id': 'FIGREF1'}, {'start': 875, 'end': 882, 'text': 'Table 1', 'ref_id': None}], 'section': 'Results'}, {'text': 'All methods were able identify the bulk of organisms in their databases at high abundances (75% and 18%, Figure 3 ), however performance varied considerably at lower abundances and depended on the particular organism and method used. SIANN detected each organism at high confidence, even at levels as low as 0.3% and 0.07% of the total.', 'cite_spans': [], 'ref_spans': [{'start': 105, 'end': 113, 'text': 'Figure 3', 'ref_id': 'FIGREF1'}], 'section': 'Results'}, {'text': \"The process of detecting trace amounts of a specific organism in a complex mixture of DNA is challenging enough for an expert, but that pales in comparison to the difficulty of accomplishing the same certainty of detection in an automated manner. The results presented here show that SIANN rapidly detects the presence of a given set of organisms with a high degree of specificity and sensitivity. For example, at the 95% confidence (PPV) cutoff of 0.2, SIANN reliably detects all of the organisms tested here at as low as 0.3% abundance. This strong performance is likely due to the fact that SIANN is able to use a method (read alignment to whole genomes) that would be far too computationally costly if it were applied to the entire collection of known genomes. By focusing on a set of (customizable) target organisms and their near neighbors, SIANN can operate quickly and with low computational requirements while delivering highly accurate results. 
SIANN is available on Illumina's BaseSpace (www.basespace.illumina.com) as a NativeApp, with the database tested here (Appendix 1), as well as a database made from the NCBI representative set of prokaryotic genomes (ftp://ftp.ncbi.nlm.nih.gov/genomes/genome_reports/) (Appendix 2) and the complete set of NCBI viral genomes (ftp://ftp.ncbi.nlm.nih.gov/refseq/release/viral/) (Appendix 3).\", 'cite_spans': [], 'ref_spans': [], 'section': 'Discussion'}, {'text': \". CC-BY-NC-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprint BaseSpace was chosen as an appropriate release platform because while the entire set of software and dependencies can be deployed by the user from within a graphical user interface, the actual computation takes place in a controlled 'cloud' environment. Such a distribution strategy obviates the need to satisfy the multiple software or OS dependencies that often arises with academic computational methods. Results for SIANN are compiled into a report format, showing both the organisms that surpass 95% confidence, as well as the closest strain match for each species. The default view masks the raw data output, so that the results are human-readable and do not present extraneous information. While the code for execution and databaseconstruction on a users system is available from Signature Science, LLC, additional databases on the BaseSpace platform can be made available upon request.\", 'cite_spans': [], 'ref_spans': [], 'section': 'Discussion'}, {'text': 'There is a neverending list of questions that one could ask of metagenomic sequencing data generated from important samples. Instead of answering them all, we demonstrate a technique with a very narrow focus that is able to report with a high degree of confidence whether a given set of organisms is present in a sample. 
These results are presented to the user in a comprehensible format, and accessible on a commonly-used web platform. The world of bioinformatics will continue to progress and develop more sophisticated tools for metagenomic analysis, and we hope that the utility of SIANN will convince others to package and benchmark their tools in a way that they can be used with confidence by the larger public, as well as the research community.', 'cite_spans': [], 'ref_spans': [], 'section': 'Discussion'}, {'text': '. CC-BY-NC-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprint', 'cite_spans': [], 'ref_spans': [], 'section': 'Discussion'}], 'bib_entries': {'BIBREF0': {'ref_id': 'b0', 'title': 'Scalable metagenomic taxonomy classification using a reference genome database', 'authors': [{'first': 'S', 'middle': ['K'], 'last': 'Ames', 'suffix': ''}, {'first': 'D', 'middle': ['A'], 'last': 'Hysom', 'suffix': ''}, {'first': 'S', 'middle': ['N'], 'last': 'Gardner', 'suffix': ''}, {'first': 'G', 'middle': ['S'], 'last': 'Lloyd', 'suffix': ''}, {'first': 'M', 'middle': ['B'], 'last': 'Gokhale', 'suffix': ''}, {'first': 'Allen', 'middle': [], 'last': 'Je', 'suffix': ''}], 'year': 2013, 'venue': 'Bioinformatics', 'volume': '29', 'issn': '18', 'pages': '2253--60', 'other_ids': {'DOI': ['10.1093/bioinformatics/btt389']}}, 'BIBREF1': {'ref_id': 'b1', 'title': 'A comparative evaluation of sequence classification programs', 'authors': [{'first': 'A', 'middle': ['L'], 'last': 'Bazinet', 'suffix': ''}, {'first': 'M', 'middle': ['P'], 'last': 'Cummings', 'suffix': ''}], 'year': 2012, 'venue': 'BMC Bioinformatics', 'volume': '13', 'issn': '', 'pages': '', 'other_ids': {'DOI': ['10.1186/1471-2105-13-92']}}, 'BIBREF2': {'ref_id': 'b2', 'title': 'Rapid 
phylogenetic and functional classification of short genomic fragments with signature peptides', 'authors': [{'first': 'J', 'middle': [], 'last': 'Berendzen', 'suffix': ''}, {'first': 'W', 'middle': ['J'], 'last': 'Bruno', 'suffix': ''}, {'first': 'J', 'middle': ['D'], 'last': 'Cohn', 'suffix': ''}, {'first': 'N', 'middle': ['W'], 'last': 'Hengartner', 'suffix': ''}, {'first': 'C', 'middle': ['R'], 'last': 'Kuske', 'suffix': ''}, {'first': 'B', 'middle': ['H'], 'last': 'Mcmahon', 'suffix': ''}, {'first': 'M', 'middle': ['A'], 'last': 'Wolinsky', 'suffix': ''}, {'first': 'G', 'middle': [], 'last': 'Xie', 'suffix': ''}], 'year': 2012, 'venue': 'BMC Research Notes', 'volume': '5', 'issn': '', 'pages': '', 'other_ids': {'DOI': ['10.1186/1756-0500-5-460']}}, 'BIBREF3': {'ref_id': 'b3', 'title': 'Phymm and PhymmBL: metagenomic phylogenetic classification with interpolated Markov models', 'authors': [{'first': 'A', 'middle': [], 'last': 'Brady', 'suffix': ''}, {'first': 'S', 'middle': ['L'], 'last': 'Salzberg', 'suffix': ''}], 'year': 2009, 'venue': 'Nature Methods', 'volume': '6', 'issn': '9', 'pages': '673--679', 'other_ids': {'DOI': ['10.1038/nmeth.1358']}}, 'BIBREF4': {'ref_id': 'b4', 'title': 'Integrative analysis of environmental sequences using MEGAN4', 'authors': [{'first': 'D', 'middle': ['H'], 'last': 'Huson', 'suffix': ''}, {'first': 'S', 'middle': [], 'last': 'Mitra', 'suffix': ''}, {'first': 'N', 'middle': [], 'last': 'Weber', 'suffix': ''}, {'first': 'H', 'middle': [], 'last': 'Ruscheweyh', 'suffix': ''}, {'first': 'S', 'middle': ['C'], 'last': 'Schuster', 'suffix': ''}], 'year': 2011, 'venue': 'Genome Research', 'volume': '21', 'issn': '', 'pages': '1552--1560', 'other_ids': {}}, 'BIBREF5': {'ref_id': 'b5', 'title': 'PEMer: a computational framework with simulation-based error models for inferring genomic structural variants from massive paired-end sequencing data', 'authors': [{'first': 'J', 'middle': ['O'], 'last': 'Korbel', 'suffix': ''}, {'first': 'A', 
'middle': [], 'last': 'Abyzov', 'suffix': ''}, {'first': 'X', 'middle': ['J'], 'last': 'Mu', 'suffix': ''}, {'first': 'N', 'middle': [], 'last': 'Carriero', 'suffix': ''}, {'first': 'P', 'middle': [], 'last': 'Cayting', 'suffix': ''}, {'first': 'Z', 'middle': [], 'last': 'Zhang', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Snyder', 'suffix': ''}, {'first': 'M', 'middle': ['B'], 'last': 'Gerstein', 'suffix': ''}], 'year': 2009, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF7': {'ref_id': 'b7', 'title': 'Fast gapped-read alignment with Bowtie 2', 'authors': [{'first': 'B', 'middle': [], 'last': 'Langmead', 'suffix': ''}, {'first': 'S', 'middle': [], 'last': 'Salzberg', 'suffix': ''}], 'year': 2012, 'venue': 'Nature Methods', 'volume': '9', 'issn': '', 'pages': '357--359', 'other_ids': {}}, 'BIBREF8': {'ref_id': 'b8', 'title': 'Accurate and fast estimation of taxonomic profiles from metagenomic shotgun sequences', 'authors': [{'first': 'B', 'middle': [], 'last': 'Liu', 'suffix': ''}, {'first': 'T', 'middle': [], 'last': 'Gibbons', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Ghodsi', 'suffix': ''}, {'first': 'T', 'middle': [], 'last': 'Treangen', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Pop', 'suffix': ''}], 'year': 2011, 'venue': 'BMC Genomics', 'volume': '12', 'issn': '2', 'pages': '', 'other_ids': {}}, 'BIBREF9': {'ref_id': 'b9', 'title': 'MetaSim: a sequencing simulator for genomics and metagenomics', 'authors': [{'first': 'D', 'middle': ['C'], 'last': 'Richter', 'suffix': ''}, {'first': 'F', 'middle': [], 'last': 'Ott', 'suffix': ''}, {'first': 'A', 'middle': ['F'], 'last': 'Auch', 'suffix': ''}, {'first': 'R', 'middle': [], 'last': 'Schmid', 'suffix': ''}, {'first': 'D', 'middle': ['H'], 'last': 'Huson', 'suffix': ''}], 'year': 2008, 'venue': 'PLoS One', 'volume': '3', 'issn': '10', 'pages': '', 'other_ids': {'DOI': ['10.1371/journal.pone.0003373']}}, 'BIBREF10': {'ref_id': 'b10', 'title': 'Genomic 
variation landscape of the human gut microbiome', 'authors': [{'first': 'S', 'middle': [], 'last': 'Schloissnig', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Arumugam', 'suffix': ''}, {'first': 'S', 'middle': [], 'last': 'Sunagawa', 'suffix': ''}, {'first': 'M', 'middle': [], 'last': 'Mitreva', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Tap', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Zhu', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Waller', 'suffix': ''}, {'first': 'D', 'middle': ['R'], 'last': 'Mende', 'suffix': ''}, {'first': 'J', 'middle': ['R'], 'last': 'Kultima', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Martin', 'suffix': ''}, {'first': 'K', 'middle': [], 'last': 'Kota', 'suffix': ''}, {'first': 'S', 'middle': ['R'], 'last': 'Sunyaev', 'suffix': ''}, {'first': 'G', 'middle': ['M'], 'last': 'Weinstock', 'suffix': ''}, {'first': 'P', 'middle': [], 'last': 'Bork', 'suffix': ''}], 'year': 2013, 'venue': 'Nature', 'volume': '493', 'issn': '7430', 'pages': '45--50', 'other_ids': {'DOI': ['10.1038/nature11711']}}}, 'ref_entries': {'FIGREF0': {'text': 'Relationship of reported value for each program (horizontal axis, log scale) to the empirically-determined Positive Predictive Value (PPV), shown on the vertical axis. While the exact values depend on the test data used, the general values at significant cutoff values (0.8, 0.9, 0.95 PPV) remain relatively constant across different datasets (data not shown).', 'latex': None, 'type': 'figure'}, 'FIGREF1': {'text': 'The Positive Predictive Value (PPV, vertical axis) is shown for each organism (boxes on right), at each level of known abundance (horizontal axis, see', 'latex': None, 'type': 'figure'}, 'TABREF0': {'text': '. Summary of methods for metagenomic classification.', 'latex': None, 'type': 'table'}, 'TABREF1': {'text': 'Table 2. The abundance of each target organism in each set of simulated datasets. 
Each set is indicated by the number in the top row, and was generated with 50 replicates.', 'latex': None, 'type': 'table'}, 'TABREF2': {'text': 'Segata N, Waldron L, Ballarini A, Narasimhan V,Jousson O, and Huttenhower C (2012). Metagenomic microbial community profiling using unique clade-specific marker genes. NatureMethods, 9(8):811-4. doi: 10.1038/nmeth.2066. Sunagawa S, Mende DR, Zeller G, Izquierdo-Carrasco F, Berger SA, Kultima JR, Coelho LP, Arumugam M, Tap J, Nielsen HB, Rasmussen S, Brunak S, Pedersen O, Guarner F, de Vos WM, Wang J, Li J, Doré J, Ehrlich SD, Stamatakis A & Bork P (2013). Metagenomic species profiling using universal phylogenetic marker genes. Nature Methods, 10, 1196-1199 (2013) doi:10.1038/nmeth.2693 Wood DE and Salzberg SL. Ultrafast metagenomic sequence classification using exact alignments. In submission. Wu GD, Chen J, Hoffmann C, Bittinger K, Chen YY, Keilbaugh SA, Bewtra M, Knights D, Walters WA, Knight R, Sinha R, Gilroy E, Gupta K, Baldassano R, Nessel L, Li H, Bushman FD, Lewis JD (2011). Linking long-term dietary patterns with gut microbial enterotypes. Science. 334(6052):105-8. doi: 10.1126/science.1208344. The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprint Appendix 2: Viral Database Abaca bunchy top virus DNA-C Abaca bunchy top virus DNA-M Abaca bunchy top virus DNA-The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . 
https://doi.org/10.1101Circovirus-like genome BBC-A Circovirus-like genome CB-A Circovirus-like genome CB-B Circovirus-like genome RW-A Circovirus-like genome RW-B Circovirus-like genome RW-C Circovirus-like genome RW-D Circovirus-like genome RW-E Circovirus-like genome SAR-A Circovirus-like genome SAR-B Citrus psorosis virus RNA1 Citrus psorosis virus RNA2 Citrus psorosis virus RNA3The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprintThe copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprintThe copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprint . CC-BY-NC-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprint . CC-BY-NC-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder.It . https://doi.org/10.1101/001727 doi: bioRxiv preprint Thermovirga lienii DSM 17291 . CC-BY-NC-ND 4.0 International license is made available under a The copyright holder for this preprint (which was not peer-reviewed) is the author/funder. It . https://doi.org/10.1101/001727 doi: bioRxiv preprint Thermus scotoductus SA 01 Thermus sp CCB US3 UF1 Thermus thermophilus HB27 Vibrio cholerae O1 biovar El Tor str N16961 Vibrio fischeri ES114 Vibrio furnissii NCTC 11218 Vibrio harveyi ATCC BAA 1116 . 
CC-BY-NC-ND 4.0 International license is made available under a', 'latex': None, 'type': 'table'}}, 'back_matter': []}\n" + "Meta count:52398 biorxiv papers count: 867\n" ] } ], "source": [ - "import json\n", - "\n", - "with open('f056da9c64fbf00a4645ae326e8a4339d015d155.json') as f:\n", - " data = json.load(f)\n", - "\n", - "print(data)" + "print(f'Meta count:{len(meta)} biorxiv papers count: {len(bio)}')" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "metadata": {}, + "outputs": [], + "source": [ + "bio_clean = bio.drop_duplicates().dropna()" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 113, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "dict_keys(['paper_id', 'metadata', 'abstract', 'body_text', 'bib_entries', 'ref_entries', 'back_matter'])" + "0" ] }, - "execution_count": 25, + "execution_count": 113, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "data.keys()" + "len(bio_clean)" ] }, { - "cell_type": "code", - "execution_count": 26, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Abstract \n", - " Next-generation sequencing is increasingly being used to study samples composed of mixtures of organisms, such as in clinical applications where the presence of a pathogen at very low abundance may be highly important. We present an analytical method (SIANN: Strain Identification by Alignment to Near Neighbors) specifically designed to rapidly detect a set of target organisms in mixed samples that achieves a high degree of species-and strain-specificity by aligning short sequence reads to the genomes of near neighbor organisms, as well as that of the target. Empirical benchmarking alongside the current state-of-the-art methods shows an extremely high Positive Predictive Value, even at very low abundances of the target organism in a mixed sample. 
SIANN is available as an Illumina BaseSpace app, as well as through Signature Science, LLC. SIANN results are presented in a streamlined report designed to be comprehensible to the non-specialist user, providing a powerful tool for rapid species detection in a mixed sample. By focusing on a set of (customizable) target organisms and their near neighbors, SIANN can operate quickly and with low computational requirements while delivering highly accurate results.\n" - ] - } - ], "source": [ - "for entry in data['abstract']:\n", - " print(entry['section'],\"\\n\",entry['text'])" + "Using the above dropna condition doesn't work out, number of rows in the resulting dataframe is zero.\n", + "We found a solution Cite: [stack overflow post](https://stackoverflow.com/questions/39241346/pandas-dropna-on-specify-attribute)" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 114, "metadata": {}, "outputs": [], "source": [ - "def load_json(filename):\n", - " with open(filename + \".json\") as f:\n", - " data = json.load(f)\n", - " return data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Code to read all json files in the directory in one go (Cite: [Kernel on Kaggle](https://www.kaggle.com/maksimeren/covid-19-literature-clustering))" + "bio_clean = bio.loc[~(bio.sha.isnull())]" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 115, "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "767" + ] + }, + "execution_count": 115, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "## Reading all files from HDFS" + "len(bio_clean)" ] }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 116, "metadata": {}, "outputs": [ { @@ -2728,681 +1046,992 @@ " \n", " \n", " \n", - " 0\n", - " xqhn0vbp\n", - " 1e1286db212100993d03cc22374b624f7caee956\n", - " PMC\n", - " Airborne rhinovirus detection and effect of ul...\n", - " 10.1186/1471-2458-3-5\n", - " 
PMC140314\n", - " 12525263.0\n", - " no-cc\n", - " BACKGROUND: Rhinovirus, the most common cause ...\n", - " 2003-01-13\n", - " Myatt, Theodore A; Johnston, Sebastian L; Rudn...\n", - " BMC Public Health\n", + " 31175\n", + " vho70jcx\n", + " f056da9c64fbf00a4645ae326e8a4339d015d155\n", + " biorxiv\n", + " SIANN: Strain Identification by Alignment to N...\n", + " 10.1101/001727\n", + " NaN\n", + " NaN\n", + " biorxiv\n", + " Next-generation sequencing is increasingly bei...\n", + " 2014-01-10\n", + " Samuel Minot; Stephen D Turner; Krista L Ternu...\n", + " NaN\n", " NaN\n", " NaN\n", " True\n", - " True\n", - " custom_license\n", - " https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...\n", + " False\n", + " biorxiv_medrxiv\n", + " https://doi.org/10.1101/001727\n", " \n", " \n", - " 1\n", - " gi6uaa83\n", - " 8ae137c8da1607b3a8e4c946c07ca8bda67f88ac\n", - " PMC\n", - " Discovering human history from stomach bacteria\n", - " 10.1186/gb-2003-4-5-213\n", - " PMC156578\n", - " 12734001.0\n", - " no-cc\n", - " Recent analyses of human pathogens have reveal...\n", - " 2003-04-28\n", - " Disotell, Todd R\n", - " Genome Biol\n", + " 31176\n", + " i9tbix2v\n", + " daf32e013d325a6feb80e83d15aabc64a48fae33\n", + " biorxiv\n", + " Spatial epidemiology of networked metapopulati...\n", + " 10.1101/003889\n", + " NaN\n", + " NaN\n", + " biorxiv\n", + " An emerging disease is one infectious epidemic...\n", + " 2014-06-04\n", + " Lin WANG; Xiang Li\n", + " NaN\n", " NaN\n", " NaN\n", " True\n", - " True\n", - " custom_license\n", - " https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...\n", + " False\n", + " biorxiv_medrxiv\n", + " https://doi.org/10.1101/003889\n", " \n", " \n", - " 2\n", - " le0ogx1s\n", + " 31177\n", + " 62gfisc6\n", + " f33c6d94b0efaa198f8f3f20e644625fa3fe10d2\n", + " biorxiv\n", + " Sequencing of the human IG light chain loci fr...\n", + " 10.1101/006866\n", + " NaN\n", + " NaN\n", + " biorxiv\n", + " Germline variation at immunoglobulin gene (IG)...\n", + " 
2014-07-03\n", + " Corey T Watson; Karyn Meltz Steinberg; Tina A ...\n", " NaN\n", - " PMC\n", - " A new recruit for the army of the men of death\n", - " 10.1186/gb-2003-4-7-113\n", - " PMC193621\n", - " 12844350.0\n", - " no-cc\n", - " The army of the men of death, in John Bunyan's...\n", - " 2003-06-27\n", - " Petsko, Gregory A\n", - " Genome Biol\n", " NaN\n", " NaN\n", - " False\n", " True\n", - " custom_license\n", - " https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...\n", + " False\n", + " biorxiv_medrxiv\n", + " https://doi.org/10.1101/006866\n", " \n", " \n", - " 3\n", - " fy4w7xz8\n", - " 0104f6ceccf92ae8567a0102f89cbb976969a774\n", - " PMC\n", - " Association of HLA class I with severe acute r...\n", - " 10.1186/1471-2350-4-9\n", - " PMC212558\n", - " 12969506.0\n", - " no-cc\n", - " BACKGROUND: The human leukocyte antigen (HLA) ...\n", - " 2003-09-12\n", - " Lin, Marie; Tseng, Hsiang-Kuang; Trejaut, Jean...\n", - " BMC Med Genet\n", + " 31178\n", + " 058r9486\n", + " 4da8a87e614373d56070ed272487451266dce919\n", + " biorxiv\n", + " Bayesian mixture analysis for metagenomic comm...\n", + " 10.1101/007476\n", + " NaN\n", + " NaN\n", + " biorxiv\n", + " Deep sequencing of clinical samples is now an ...\n", + " 2014-07-25\n", + " Sofia Morfopoulou; Vincent Plagnol\n", + " NaN\n", " NaN\n", " NaN\n", " True\n", - " True\n", - " custom_license\n", - " https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...\n", + " False\n", + " biorxiv_medrxiv\n", + " https://doi.org/10.1101/007476\n", " \n", " \n", - " 4\n", - " 0qaoam29\n", - " 5b68a553a7cbbea13472721cd1ad617d42b40c26\n", - " PMC\n", - " A double epidemic model for the SARS propagation\n", - " 10.1186/1471-2334-3-19\n", - " PMC222908\n", - " 12964944.0\n", - " no-cc\n", - " BACKGROUND: An epidemic of a Severe Acute Resp...\n", - " 2003-09-10\n", - " Ng, Tuen Wai; Turinici, Gabriel; Danchin, Antoine\n", - " BMC Infect Dis\n", + " 31179\n", + " wich35l7\n", + " eccef80cfbe078235df22398f195d5db462d8000\n", + " 
biorxiv\n", + " Mapping a viral phylogeny onto outbreak trees ...\n", + " 10.1101/010389\n", + " NaN\n", + " NaN\n", + " biorxiv\n", + " Developing methods to reconstruct transmission...\n", + " 2014-11-11\n", + " Stephen P Velsko; Jonathan E Allen\n", + " NaN\n", " NaN\n", " NaN\n", " True\n", - " True\n", - " custom_license\n", - " https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...\n", + " False\n", + " biorxiv_medrxiv\n", + " https://doi.org/10.1101/010389\n", " \n", " \n", "\n", "" ], "text/plain": [ - " cord_uid sha source_x \\\n", - "0 xqhn0vbp 1e1286db212100993d03cc22374b624f7caee956 PMC \n", - "1 gi6uaa83 8ae137c8da1607b3a8e4c946c07ca8bda67f88ac PMC \n", - "2 le0ogx1s NaN PMC \n", - "3 fy4w7xz8 0104f6ceccf92ae8567a0102f89cbb976969a774 PMC \n", - "4 0qaoam29 5b68a553a7cbbea13472721cd1ad617d42b40c26 PMC \n", - "\n", - " title doi \\\n", - "0 Airborne rhinovirus detection and effect of ul... 10.1186/1471-2458-3-5 \n", - "1 Discovering human history from stomach bacteria 10.1186/gb-2003-4-5-213 \n", - "2 A new recruit for the army of the men of death 10.1186/gb-2003-4-7-113 \n", - "3 Association of HLA class I with severe acute r... 10.1186/1471-2350-4-9 \n", - "4 A double epidemic model for the SARS propagation 10.1186/1471-2334-3-19 \n", + " cord_uid sha source_x \\\n", + "31175 vho70jcx f056da9c64fbf00a4645ae326e8a4339d015d155 biorxiv \n", + "31176 i9tbix2v daf32e013d325a6feb80e83d15aabc64a48fae33 biorxiv \n", + "31177 62gfisc6 f33c6d94b0efaa198f8f3f20e644625fa3fe10d2 biorxiv \n", + "31178 058r9486 4da8a87e614373d56070ed272487451266dce919 biorxiv \n", + "31179 wich35l7 eccef80cfbe078235df22398f195d5db462d8000 biorxiv \n", "\n", - " pmcid pubmed_id license \\\n", - "0 PMC140314 12525263.0 no-cc \n", - "1 PMC156578 12734001.0 no-cc \n", - "2 PMC193621 12844350.0 no-cc \n", - "3 PMC212558 12969506.0 no-cc \n", - "4 PMC222908 12964944.0 no-cc \n", + " title doi \\\n", + "31175 SIANN: Strain Identification by Alignment to N... 
10.1101/001727 \n", + "31176 Spatial epidemiology of networked metapopulati... 10.1101/003889 \n", + "31177 Sequencing of the human IG light chain loci fr... 10.1101/006866 \n", + "31178 Bayesian mixture analysis for metagenomic comm... 10.1101/007476 \n", + "31179 Mapping a viral phylogeny onto outbreak trees ... 10.1101/010389 \n", "\n", - " abstract publish_time \\\n", - "0 BACKGROUND: Rhinovirus, the most common cause ... 2003-01-13 \n", - "1 Recent analyses of human pathogens have reveal... 2003-04-28 \n", - "2 The army of the men of death, in John Bunyan's... 2003-06-27 \n", - "3 BACKGROUND: The human leukocyte antigen (HLA) ... 2003-09-12 \n", - "4 BACKGROUND: An epidemic of a Severe Acute Resp... 2003-09-10 \n", + " pmcid pubmed_id license \\\n", + "31175 NaN NaN biorxiv \n", + "31176 NaN NaN biorxiv \n", + "31177 NaN NaN biorxiv \n", + "31178 NaN NaN biorxiv \n", + "31179 NaN NaN biorxiv \n", "\n", - " authors journal \\\n", - "0 Myatt, Theodore A; Johnston, Sebastian L; Rudn... BMC Public Health \n", - "1 Disotell, Todd R Genome Biol \n", - "2 Petsko, Gregory A Genome Biol \n", - "3 Lin, Marie; Tseng, Hsiang-Kuang; Trejaut, Jean... BMC Med Genet \n", - "4 Ng, Tuen Wai; Turinici, Gabriel; Danchin, Antoine BMC Infect Dis \n", + " abstract publish_time \\\n", + "31175 Next-generation sequencing is increasingly bei... 2014-01-10 \n", + "31176 An emerging disease is one infectious epidemic... 2014-06-04 \n", + "31177 Germline variation at immunoglobulin gene (IG)... 2014-07-03 \n", + "31178 Deep sequencing of clinical samples is now an ... 2014-07-25 \n", + "31179 Developing methods to reconstruct transmission... 2014-11-11 \n", "\n", - " Microsoft Academic Paper ID WHO #Covidence has_pdf_parse \\\n", - "0 NaN NaN True \n", - "1 NaN NaN True \n", - "2 NaN NaN False \n", - "3 NaN NaN True \n", - "4 NaN NaN True \n", + " authors journal \\\n", + "31175 Samuel Minot; Stephen D Turner; Krista L Ternu... 
NaN \n", + "31176 Lin WANG; Xiang Li NaN \n", + "31177 Corey T Watson; Karyn Meltz Steinberg; Tina A ... NaN \n", + "31178 Sofia Morfopoulou; Vincent Plagnol NaN \n", + "31179 Stephen P Velsko; Jonathan E Allen NaN \n", "\n", - " has_pmc_xml_parse full_text_file \\\n", - "0 True custom_license \n", - "1 True custom_license \n", - "2 True custom_license \n", - "3 True custom_license \n", - "4 True custom_license \n", + " Microsoft Academic Paper ID WHO #Covidence has_pdf_parse \\\n", + "31175 NaN NaN True \n", + "31176 NaN NaN True \n", + "31177 NaN NaN True \n", + "31178 NaN NaN True \n", + "31179 NaN NaN True \n", "\n", - " url \n", - "0 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1... \n", - "1 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1... \n", - "2 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1... \n", - "3 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2... \n", - "4 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2... " + " has_pmc_xml_parse full_text_file url \n", + "31175 False biorxiv_medrxiv https://doi.org/10.1101/001727 \n", + "31176 False biorxiv_medrxiv https://doi.org/10.1101/003889 \n", + "31177 False biorxiv_medrxiv https://doi.org/10.1101/006866 \n", + "31178 False biorxiv_medrxiv https://doi.org/10.1101/007476 \n", + "31179 False biorxiv_medrxiv https://doi.org/10.1101/010389 " ] }, - "execution_count": 79, + "execution_count": 116, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "client_hdfs = InsecureClient('http://madison:31802')\n", - "with client_hdfs.read('/cord19dataset/metadata.csv', encoding = 'utf-8') as reader:\n", - " df = pd.read_csv(reader)\n", - "\n", - "df.head()" + "bio_clean.head()" ] }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 117, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "{'directoryCount': 17,\n", - " 'fileCount': 59317,\n", - " 'length': 7913288034,\n", - " 'quota': -1,\n", - " 'spaceConsumed': 23739864102,\n", - " 'spaceQuota': -1,\n", - " 'typeQuota': {}}" 
- ] - }, - "execution_count": 81, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "['biorxiv'] [nan] [nan] [nan] [nan]\n" + ] } ], "source": [ - "client_hdfs.content('/cord19dataset')" + "print(bio_clean['source_x'].unique(),\n", + "bio_clean['pmcid'].unique(),\n", + "bio_clean['pubmed_id'].unique(),\n", + "bio_clean['Microsoft Academic Paper ID'].unique(),\n", + "bio_clean['WHO #Covidence'].unique())" ] }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 118, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['COVID.DATA.LIC.AGMT.pdf',\n", - " 'biorxiv_medrxiv',\n", - " 'biorxiv_medrxiv.tar.gz',\n", - " 'comm_use_subset',\n", - " 'cord19_specter_embeddings_2020-04-10',\n", - " 'custom_license',\n", - " 'json_schema.txt',\n", - " 'metadata.csv',\n", - " 'metadata.readme',\n", - " 'noncomm_use_subset']" - ] - }, - "execution_count": 84, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "fnames = client_hdfs.list('/cord19dataset')\n", - "fnames" + "bio_slim = bio_clean.drop(['Microsoft Academic Paper ID','WHO #Covidence','pubmed_id','pmcid'], axis = 1)" ] }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 119, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['0015023cc06b5362d332b3baf348d11567ca2fbb.json',\n", - " '00340eea543336d54adda18236424de6a5e91c9d.json',\n", - " '004f0f8bb66cf446678dc13cf2701feec4f36d76.json',\n", - " '00911cf4f99a3d5ae5e5b787675646a743574496.json',\n", - " '00d16927588fb04d4be0e6b269fc02f0d3c2aa7b.json',\n", - " '00eb9220dc8cd351393b6b035323d350f103f8c6.json',\n", - " '0139ea4ca580af99b602c6435368e7fdbefacb03.json',\n", - " '013d9d1cba8a54d5d3718c229b812d7cf91b6c89.json',\n", - " '018fb5e62fbbcae07d57d94d29ac630dcc4dccf9.json',\n", - " '01d162d7fae6aaba8e6e60e563ef4c2fca7b0e18.json',\n", - " '01e3b313e78a352593be2ff64927192af66619b5.json',\n", - " 
'02201e4601ab0eb70b6c26480cf2bfeae2625193.json',\n", - " '0255ea4b2f26a51a3bfa3bd8f3e1978c82c976d5.json',\n", - " '029c1c588047f1d612a219ee15494d2d19ff7439.json',\n", - " '0313c3faa16cd66d64f31ae37e40fb70695d69fb.json',\n", - " '033ea7af3e6137df652de026f0751ac435327b75.json',\n", - " '03813d8657ba43ea382788caec2d14257b26d8fd.json',\n", - " '03ce432f27c7df6af22b92245a614db2ecb5de5f.json',\n", - " '03ea3a614b56409d3f099c9ad764864293132540.json',\n", - " '03ea9ad47ebe9a599205b99390c45490e6724024.json',\n", - " '04030bba3035a58c7725ae267973206f6eb6c0b4.json',\n", - " '041bae0a6de2b69979d39460b3f2ee8946534ec2.json',\n", - " '05082393ba4c7ec530190dd887d99c74fd72f6d6.json',\n", - " '052bf4fb7deaf593862991af3b118b5f11a9fbe1.json',\n", - " '0537863d6c059cc9cf5ec8dd0beb27dec5d9d801.json',\n", - " '05d47dd5b46f86428de058db4ecc8bca76a9ad16.json',\n", - " '05d99c07db59b6948e39bfa62c2cbbf62944059a.json',\n", - " '05e37847597676ae715adcec18a6574e75a20546.json',\n", - " '061ffcdd4d674c4d7ce24e4aa7c5037c68596864.json',\n", - " '0624a12abfe85c8b5070850d912a2db4cd453236.json',\n", - " '06837008df793f872a6fb830dfb83c9525edb7c4.json',\n", - " '06a1002f9fbea7179ac3572843f66b14568af6e4.json',\n", - " '06acb8da0009104a2af509334abe3c26b1da66a1.json',\n", - " '06c1b3535b83251cf92c01258b5048beeab7a460.json',\n", - " '06d12dc5ac32d82387c65370d0a600e13059122d.json',\n", - " '073d74442e2655d79b0b3f764a627ec667ad422c.json',\n", - " '07e833d0917cace550853f72923856d0fe1a7120.json',\n", - " '08660499ee722a74043f8417faee3e1eeb9d0f5f.json',\n", - " '08826d0596a01a2a482eb6d80edb0e87cddc304e.json',\n", - " '08911cdc65e71e6398ca79b46806e6c8b2b730ae.json',\n", - " '08a22278486e12768ce186677a6a89663d24586f.json',\n", - " '090b6c8b3df30bc248221869f673a2d970caa1b9.json',\n", - " '091a8e9a61e19e88caeb039f0e3888d111b20439.json',\n", - " '09892e597bdc1ded9eb5922b4f3d41e041d6634a.json',\n", - " '09b6706748f0c1ae0da436ac2dfac9052b84e4ea.json',\n", - " '09c9fcabc66a106e01ef42247cbd86b6d85bd67f.json',\n", - " 
'09ec8daa8e32168d92d05b86de1784c639685fb4.json',\n", - " '09fd48d0e1f60fa69b68ebd54bd5d71fc08dec96.json',\n", - " '0a27cb2cd52229472fcfc3e49d3a3cb7179867e4.json',\n", - " '0a2a28cb82e7a03af0a9fad4fd4c68c9fdac2477.json']" - ] - }, - "execution_count": 89, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "fnames = client_hdfs.list('/cord19dataset/biorxiv_medrxiv/biorxiv_medrxiv/pdf_json')\n", - "fnames[0:50]" + "bio_slim = bio_slim.drop(['url','doi','license','full_text_file'],axis =1 )" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 120, "metadata": {}, "outputs": [], "source": [ - "##It is possible to loop through all the folders and sub folders while using \n", - "##content(,strict=False) != None as a breaking condition`" + "bio_slim = bio_slim[~bio_slim.isin(['False'])]" ] }, { - "cell_type": "code", - "execution_count": 91, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "59317" - ] - }, - "execution_count": 91, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "#There is also this provided snippet to get all files under a given directory\n", - "# Get all files under a given folder (arbitrary depth).\n", - "import posixpath as psp\n", - "fpaths = [\n", - " psp.join(dpath, fname)\n", - " for dpath, _, fnames in client_hdfs.walk('/cord19dataset')\n", - " for fname in fnames\n", - "]\n", - "len(fpaths)" + "As we saw at the end of section 2.3, all_json contains the paths of all the JSON files (i.e. 
the articles) in it.\n", + "Now we should load these articles into a DataFrame so that they can be queried" ] }, { "cell_type": "code", - "execution_count": 101, + "execution_count": 121, "metadata": {}, "outputs": [], "source": [ - "fpaths_json = list(filter(lambda x: \".json\" in x,fpaths))" + "for i, j in enumerate(all_json):\n", + " #print(i,j)\n", + " pass" ] }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 122, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "59311" - ] - }, - "execution_count": 103, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing 0 of 1934\n", + " time taken : 0:00:01.890942\n" + ] } ], "source": [ - "len(fpaths_json)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Reading all files from local disk" + "#loaddict = \n", + "articles = {}\n", + "start_time = datetime.now()\n", + "for i,path in enumerate(all_json):\n", + " if(i % 5000 == 0):\n", + " print(f'Processing {i} of {len(all_json)}')\n", + " try:\n", + " articles[i] = Article(all_json[i])\n", + " except Exception as e:\n", + " continue\n", + "end_time = datetime.now()\n", + "print(f' time taken : {(end_time - start_time)}')" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 123, "metadata": {}, + "outputs": [], "source": [ - "Since our dataset is not prohibitively large(7-8GB) it is easilty possible to reado it from disk as shown below" + "#print(articles[0].paper_id,\"\\n\\n\",articles[0].abstract,\"\\n\\n\", articles[0].body_text)" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 124, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "59311" + "1934" ] }, - "execution_count": 28, + "execution_count": 124, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "all_json = glob.glob(f'{root_path}/**/*.json', recursive=True)\n", - "len(all_json)" + "len(articles)" + ] + }, + { 
"cell_type": "code", + "execution_count": 125, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "#all_json" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 126, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing index: 0 of 1934\n", + "Processing index: 193 of 1934\n", + "Processing index: 386 of 1934\n", + "Processing index: 579 of 1934\n", + "Processing index: 772 of 1934\n", + "Processing index: 965 of 1934\n", + "Processing index: 1158 of 1934\n", + "Processing index: 1351 of 1934\n", + "Processing index: 1544 of 1934\n", + "Processing index: 1737 of 1934\n", + "Processing index: 1930 of 1934\n" + ] + }, { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
paper_idabstractbody_textauthorstitle
00015023cc06b5362d332b3baf348d11567ca2fbbAbstractThe positive stranded RNA genomes of p...VP3, and VP0 (which is further processed to VP...Joseph C. Ward. Lidia Lasecka-Dykes...(The RNA pseudoknots in foot-and-mouth disease...
100340eea543336d54adda18236424de6a5e91c9dDuring the past three months, a new coronaviru...In December 2019, a novel coronavirus, SARS-Co...Carla Mavian. Simone Marini...(Regaining perspective on SARS-CoV-2 molecular...
2004f0f8bb66cf446678dc13cf2701feec4f36d76We integrate the human movement and healthcare...The 2019-nCoV epidemic has spread across China...Hanchu Zhou. Jianan Yang...(Healthcare-resource-adjusted vulnerabilities ...
3005d189d5bd7ac01aee65e934fd3d5186a3f7b27The rapid outbreak of the new Coronavirus (COV...The outbreak of infectious diseases has always...Behzad Pirouz. Amirsina Golmohammadi...(Relationship between Average Daily Temperatur...
400911cf4f99a3d5ae5e5b787675646a743574496ABSTRARCTThe fast accumulation of viral metage...Metagenomic sequencing, which allows us to dir...Jiayu Shang. Yanni Sun(CHEER: hierarCHical taxonomic classification ...
\n", + "
" + ], "text/plain": [ - "'/s/chopin/b/grad/sanketm/cord19dataset/noncomm_use_subset/noncomm_use_subset/pdf_json/e6d882be4961d1bdd7507b4a29d86b650de0895d.json'" + " paper_id \\\n", + "0 0015023cc06b5362d332b3baf348d11567ca2fbb \n", + "1 00340eea543336d54adda18236424de6a5e91c9d \n", + "2 004f0f8bb66cf446678dc13cf2701feec4f36d76 \n", + "3 005d189d5bd7ac01aee65e934fd3d5186a3f7b27 \n", + "4 00911cf4f99a3d5ae5e5b787675646a743574496 \n", + "\n", + " abstract \\\n", + "0 AbstractThe positive stranded RNA genomes of p... \n", + "1 During the past three months, a new coronaviru... \n", + "2 We integrate the human movement and healthcare... \n", + "3 The rapid outbreak of the new Coronavirus (COV... \n", + "4 ABSTRARCTThe fast accumulation of viral metage... \n", + "\n", + " body_text \\\n", + "0 VP3, and VP0 (which is further processed to VP... \n", + "1 In December 2019, a novel coronavirus, SARS-Co... \n", + "2 The 2019-nCoV epidemic has spread across China... \n", + "3 The outbreak of infectious diseases has always... \n", + "4 Metagenomic sequencing, which allows us to dir... \n", + "\n", + " authors \\\n", + "0 Joseph C. Ward. Lidia Lasecka-Dykes... \n", + "1 Carla Mavian. Simone Marini... \n", + "2 Hanchu Zhou. Jianan Yang... \n", + "3 Behzad Pirouz. Amirsina Golmohammadi... \n", + "4 Jiayu Shang. Yanni Sun \n", + "\n", + " title \n", + "0 (The RNA pseudoknots in foot-and-mouth disease... \n", + "1 (Regaining perspective on SARS-CoV-2 molecular... \n", + "2 (Healthcare-resource-adjusted vulnerabilities ... \n", + "3 (Relationship between Average Daily Temperatur... \n", + "4 (CHEER: hierarCHical taxonomic classification ... 
" ] }, - "execution_count": 29, + "execution_count": 126, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "all_json[0]" + "dict_ = {'paper_id': [], 'abstract': [], 'body_text': [], 'authors': [], 'title': []}\n", + "for idx, entry in enumerate(all_json):\n", + " if idx % (len(all_json) // 10) == 0:\n", + " print(f'Processing index: {idx} of {len(all_json)}')\n", + " \n", + " try:\n", + " content = Article(entry)\n", + " except Exception as e:\n", + " continue\n", + " \n", + " # get metadata information\n", + " meta_data = meta.loc[meta['sha'] == content.paper_id]\n", + " # no metadata, skip this paper\n", + " if len(meta_data) == 0:\n", + " continue\n", + "\n", + " dict_['body_text'].append(content.body_text)\n", + " \n", + " \n", + " # get metadata information\n", + " meta_data = meta.loc[meta['sha'] == content.paper_id]\n", + " \n", + " try:\n", + " # if more than one author\n", + " authors = meta_data['authors'].values[0].split(';')\n", + " if len(authors) > 2:\n", + " # more than 2 authors, may be problem when plotting, so take first 2 append with ...\n", + " dict_['authors'].append(\". \".join(authors[:2]) + \"...\")\n", + " else:\n", + " # authors will fit in plot\n", + " dict_['authors'].append(\". 
\".join(authors))\n", + " except Exception as e:\n", + " # if only one author - or Null valie\n", + " dict_['authors'].append(meta_data['authors'].values[0])\n", + " \n", + " # add the title information, add breaks when needed\n", + " try:\n", + " title = meta_data['title'].values[0], 40\n", + " dict_['title'].append(title)\n", + " # if title was not provided\n", + " except Exception as e:\n", + " dict_['title'].append(meta_data['title'].values[0])\n", + " \n", + " \n", + " dict_['paper_id'].append(meta_data['sha'].values[0])\n", + " dict_['abstract'].append(meta_data['abstract'].values[0])\n", + " \n", + "df_covid = pd.DataFrame(dict_, columns=['paper_id', 'abstract', 'body_text', 'authors', 'title'])\n", + "df_covid.head()" ] }, { "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "_125json = all_json[0:12500]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, + "execution_count": 127, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1934 1934 1934 1934 1934\n" + ] + } + ], "source": [ - "Let's make a class so that we could refer to the loaded articles easily" + "print(len(dict_['paper_id']),len(dict_['abstract']),len(dict_['body_text']),len(dict_['authors']),len(dict_['title']))" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 128, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "194" + ] + }, + "execution_count": 128, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "class Article:\n", - " \n", - " def __init__(self,filepath):\n", - " with open(filepath) as f:\n", - " metadata = json.load(f)\n", - " self.paper_id = metadata['paper_id']\n", - " self.title = metadata['metadata']['title']\n", - " self.abstract = \"\"\n", - " self.body_text = \"\"\n", - " \n", - " #Abstract\n", - " try:\n", - " for entry in metadata['abstract']:\n", - " self.abstract += 
str(entry['text'])\n", - " except:\n", - " pass\n", - " \n", - " \n", - "\n", - " #body_text\n", - " for entry in metadata['body_text']:\n", - " self.body_text += str(entry['text'])\n", - "\n", - "\n", - " \n", - " def __repr__(self):\n", - " return f'Article Object with id: {self.paper_id} \\n title:{self.title} \\n abstract:{self.abstract}'\n", - " \n", - " " + "len(df_covid['abstract'][0].strip().split())" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 129, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
paper_idabstractbody_textauthorstitleabstract_word_countbody_word_count
00015023cc06b5362d332b3baf348d11567ca2fbbAbstractThe positive stranded RNA genomes of p...VP3, and VP0 (which is further processed to VP...Joseph C. Ward. Lidia Lasecka-Dykes...(The RNA pseudoknots in foot-and-mouth disease...1941709
100340eea543336d54adda18236424de6a5e91c9dDuring the past three months, a new coronaviru...In December 2019, a novel coronavirus, SARS-Co...Carla Mavian. Simone Marini...(Regaining perspective on SARS-CoV-2 molecular...1382487
2004f0f8bb66cf446678dc13cf2701feec4f36d76We integrate the human movement and healthcare...The 2019-nCoV epidemic has spread across China...Hanchu Zhou. Jianan Yang...(Healthcare-resource-adjusted vulnerabilities ...34749
3005d189d5bd7ac01aee65e934fd3d5186a3f7b27The rapid outbreak of the new Coronavirus (COV...The outbreak of infectious diseases has always...Behzad Pirouz. Amirsina Golmohammadi...(Relationship between Average Daily Temperatur...2492791
400911cf4f99a3d5ae5e5b787675646a743574496ABSTRARCTThe fast accumulation of viral metage...Metagenomic sequencing, which allows us to dir...Jiayu Shang. Yanni Sun(CHEER: hierarCHical taxonomic classification ...1395153
5009ceadbb6f9f0d597b9516f0b9ca01e40231aadObjective: Describe the changes in patient pop...Since its emergence in December 2019 the novel...Omar Badawi. Xinggang Liu...(Impact of COVID-19 pandemic on severity of il...2921409
600d16927588fb04d4be0e6b269fc02f0d3c2aa7bAbstractInfectious bronchitis (IB) causes sign...Infectious bronchitis (IB), which is caused by...Salman L. Butt. Eric C. Erwood...(Real-time, MinION-based, amplicon sequencing ...2483958
700eb9220dc8cd351393b6b035323d350f103f8c6Importance: As with other traumatic events, pa...Evidence from prior pandemics suggests that, a...Victor M. Castro. Roy H Perlis(Impact of COVID-19 on psychiatric assessment ...2462103
80109d1273b2d59a099ab66cdad6939d5e7fcb2e8In severe cases of coronavirus disease 2019 (C...To date, the coronavirus disease 2019 (COVID-1...Yu Zuo. Srilakshmi Yalavarthi...(Neutrophil extracellular traps (NETs) as mark...2472734
90139ea4ca580af99b602c6435368e7fdbefacb03AbstractBackgroundNipah Virus (NiV) came into ...Nipah is an infectious negative-sense single-s...Nishi Kumari. Ayush Upadhyay...(A Combined Evidence Approach to Prioritize Ni...3262382
\n", + "
" + ], + "text/plain": [ + " paper_id \\\n", + "0 0015023cc06b5362d332b3baf348d11567ca2fbb \n", + "1 00340eea543336d54adda18236424de6a5e91c9d \n", + "2 004f0f8bb66cf446678dc13cf2701feec4f36d76 \n", + "3 005d189d5bd7ac01aee65e934fd3d5186a3f7b27 \n", + "4 00911cf4f99a3d5ae5e5b787675646a743574496 \n", + "5 009ceadbb6f9f0d597b9516f0b9ca01e40231aad \n", + "6 00d16927588fb04d4be0e6b269fc02f0d3c2aa7b \n", + "7 00eb9220dc8cd351393b6b035323d350f103f8c6 \n", + "8 0109d1273b2d59a099ab66cdad6939d5e7fcb2e8 \n", + "9 0139ea4ca580af99b602c6435368e7fdbefacb03 \n", + "\n", + " abstract \\\n", + "0 AbstractThe positive stranded RNA genomes of p... \n", + "1 During the past three months, a new coronaviru... \n", + "2 We integrate the human movement and healthcare... \n", + "3 The rapid outbreak of the new Coronavirus (COV... \n", + "4 ABSTRARCTThe fast accumulation of viral metage... \n", + "5 Objective: Describe the changes in patient pop... \n", + "6 AbstractInfectious bronchitis (IB) causes sign... \n", + "7 Importance: As with other traumatic events, pa... \n", + "8 In severe cases of coronavirus disease 2019 (C... \n", + "9 AbstractBackgroundNipah Virus (NiV) came into ... \n", + "\n", + " body_text \\\n", + "0 VP3, and VP0 (which is further processed to VP... \n", + "1 In December 2019, a novel coronavirus, SARS-Co... \n", + "2 The 2019-nCoV epidemic has spread across China... \n", + "3 The outbreak of infectious diseases has always... \n", + "4 Metagenomic sequencing, which allows us to dir... \n", + "5 Since its emergence in December 2019 the novel... \n", + "6 Infectious bronchitis (IB), which is caused by... \n", + "7 Evidence from prior pandemics suggests that, a... \n", + "8 To date, the coronavirus disease 2019 (COVID-1... \n", + "9 Nipah is an infectious negative-sense single-s... \n", + "\n", + " authors \\\n", + "0 Joseph C. Ward. Lidia Lasecka-Dykes... \n", + "1 Carla Mavian. Simone Marini... \n", + "2 Hanchu Zhou. Jianan Yang... \n", + "3 Behzad Pirouz. 
Amirsina Golmohammadi... \n", + "4 Jiayu Shang. Yanni Sun \n", + "5 Omar Badawi. Xinggang Liu... \n", + "6 Salman L. Butt. Eric C. Erwood... \n", + "7 Victor M. Castro. Roy H Perlis \n", + "8 Yu Zuo. Srilakshmi Yalavarthi... \n", + "9 Nishi Kumari. Ayush Upadhyay... \n", + "\n", + " title abstract_word_count \\\n", + "0 (The RNA pseudoknots in foot-and-mouth disease... 194 \n", + "1 (Regaining perspective on SARS-CoV-2 molecular... 138 \n", + "2 (Healthcare-resource-adjusted vulnerabilities ... 34 \n", + "3 (Relationship between Average Daily Temperatur... 249 \n", + "4 (CHEER: hierarCHical taxonomic classification ... 139 \n", + "5 (Impact of COVID-19 pandemic on severity of il... 292 \n", + "6 (Real-time, MinION-based, amplicon sequencing ... 248 \n", + "7 (Impact of COVID-19 on psychiatric assessment ... 246 \n", + "8 (Neutrophil extracellular traps (NETs) as mark... 247 \n", + "9 (A Combined Evidence Approach to Prioritize Ni... 326 \n", + "\n", + " body_word_count \n", + "0 1709 \n", + "1 2487 \n", + "2 749 \n", + "3 2791 \n", + "4 5153 \n", + "5 1409 \n", + "6 3958 \n", + "7 2103 \n", + "8 2734 \n", + "9 2382 " + ] + }, + "execution_count": 129, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "articles = {}\n", - "for i in range(5):\n", - " articles[i] = Article(all_json[i])" + "df_covid['abstract_word_count'] = df_covid['abstract'].apply(lambda x: len(str(x).strip().split()))\n", + "df_covid['body_word_count'] = df_covid['body_text'].apply(lambda x: len(str(x).strip().split()))\n", + "df_covid.head(10)" ] }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 130, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'paper_id': '0015023cc06b5362d332b3baf348d11567ca2fbb', 'metadata': {'title': 'The RNA pseudoknots in foot-and-mouth disease virus are dispensable for genome replication but essential for the production of infectious virus. 
2 3', 'authors': [{'first': 'Joseph', 'middle': ['C'], 'last': 'Ward', 'suffix': '', 'affiliation': {}, 'email': ''}, {'first': 'Lidia', 'middle': [], 'last': 'Lasecka-Dykes', 'suffix': '', 'affiliation': {}, 'email': ''}, {'first': 'Chris', 'middle': [], 'last': 'Neil', 'suffix': '', 'affiliation': {}, 'email': ''}, {'first': 'Oluwapelumi', 'middle': [], 'last': 'Adeyemi', 'suffix': '', 'affiliation': {}, 'email': ''}, {'first': 'Sarah', 'middle': [], 'last': '', 'suffix': '', 'affiliation': {}, 'email': ''}, {'first': '', 'middle': [], 'last': 'Gold', 'suffix': '', 'affiliation': {}, 'email': ''}, {'first': 'Niall', 'middle': [], 'last': 'Mclean', 'suffix': '', 'affiliation': {}, 'email': ''}, {'first': 'Caroline', 'middle': [], 'last': 'Wright', 'suffix': '', 'affiliation': {}, 'email': ''}, {'first': 'Morgan', 'middle': ['R'], 'last': 'Herod', 'suffix': '', 'affiliation': {}, 'email': ''}, {'first': 'David', 'middle': [], 'last': 'Kealy', 'suffix': '', 'affiliation': {}, 'email': ''}, {'first': 'Emma', 'middle': [], 'last': '', 'suffix': '', 'affiliation': {}, 'email': ''}, {'first': 'Warner', 'middle': [], 'last': '', 'suffix': '', 'affiliation': {}, 'email': ''}, {'first': 'Donald', 'middle': ['P'], 'last': 'King', 'suffix': '', 'affiliation': {}, 'email': ''}, {'first': 'Tobias', 'middle': ['J'], 'last': 'Tuthill', 'suffix': '', 'affiliation': {}, 'email': ''}, {'first': 'David', 'middle': ['J'], 'last': 'Rowlands', 'suffix': '', 'affiliation': {}, 'email': ''}, {'first': 'Nicola', 'middle': ['J'], 'last': '', 'suffix': '', 'affiliation': {}, 'email': ''}, {'first': 'Stonehouse', 'middle': [], 'last': 'A#', 'suffix': '', 'affiliation': {}, 'email': ''}]}, 'abstract': [{'text': 'word count: 194 22 Text word count: 5168 23 24 25 author/funder. All rights reserved. No reuse allowed without permission. Abstract 27 The positive stranded RNA genomes of picornaviruses comprise a single large open reading 28 frame flanked by 5′ and 3′ untranslated regions (UTRs). 
Foot-and-mouth disease virus (FMDV) 29 has an unusually large 5′ UTR (1.3 kb) containing five structural domains. These include the 30 internal ribosome entry site (IRES), which facilitates initiation of translation, and the cis-acting 31 replication element (cre). Less well characterised structures are a 5′ terminal 360 nucleotide 32 stem-loop, a variable length poly-C-tract of approximately 100-200 nucleotides and a series of 33 two to four tandemly repeated pseudoknots (PKs). We investigated the structures of the PKs 34 by selective 2′ hydroxyl acetylation analysed by primer extension (SHAPE) analysis and 35 determined their contribution to genome replication by mutation and deletion experiments. 36 SHAPE and mutation experiments confirmed the importance of the previously predicted PK 37 structures for their function. Deletion experiments showed that although PKs are not essential 38', 'cite_spans': [], 'ref_spans': [], 'section': 'Abstract'}, {'text': 'for replication, they provide genomes with a competitive advantage. However, although 39 replicons and full-length genomes lacking all PKs were replication competent, no infectious 40 virus was rescued from genomes containing less than one PK copy. This is consistent with our 41 earlier report describing the presence of putative packaging signals in the PK region. 42 43 author/funder. All rights reserved. No reuse allowed without permission.', 'cite_spans': [], 'ref_spans': [], 'section': 'Abstract'}], 'body_text': [{'text': 'VP3, and VP0 (which is further processed to VP2 and VP4 during virus assembly) (6). 
The P2 64 and P3 regions encode the non-structural proteins 2B and 2C and 3A, 3B (1-3) (VPg), 3C pro and 4 structural protein-coding region is replaced by reporter genes, allow the study of genome 68 replication without the requirement for high containment (9, 10) ( figure 1A ).', 'cite_spans': [], 'ref_spans': [{'start': 351, 'end': 360, 'text': 'figure 1A', 'ref_id': 'FIGREF50'}], 'section': ''}, {'text': 'The FMDV 5′ UTR is the largest known picornavirus UTR, comprising approximately 1300 71 nucleotides and containing several highly structured regions. The first 360 nucleotides at the 5′ 72 end are predicted to fold into a single large stem loop termed the S-fragment, followed by a The PKs were originally predicted in 1987 and consist of two to four tandem repeats of a ~48 86 nucleotide region containing a small stem loop and downstream interaction site (figure 1B) 87 (12). Due to the sequence similarity between the PKs (figure 1C), it is speculated that they 88 were formed by duplication events during viral replication, probably involving recombination. 89 Between two and four PKs are present in different virus isolates but no strain has been 90 identified with less than two PKs, emphasising their potential importance in the viral life cycle 91 (19, 20) . The presence of PKs has been reported in the 5′ UTR of other picornaviruses such as 92 author/funder. All rights reserved. No reuse allowed without permission. can occur in the absence of PKs at least one is required for wild-type (wt) replication. 104 Furthermore, competition experiments showed that extra copies of PKs conferred a replicative 105 advantage to genomes. Although replicons and full-length genomes lacking PKs were 106 replication-competent, no infectious virus was rescued from genomes containing less than one 107 PK copy. This is consistent with our earlier report describing the presence of putative 108 packaging signals in the PK region (22). 109 110 author/funder. All rights reserved. 
No reuse allowed without permission. Plasmid construction. 117 The FMDV replicon plasmids, pRep-ptGFP, and the replication-defective polymerase mutant 118 control, 3D-GNN, have already been described (10).', 'cite_spans': [{'start': 469, 'end': 471, 'text': '87', 'ref_id': None}, {'start': 662, 'end': 664, 'text': '89', 'ref_id': None}, {'start': 857, 'end': 861, 'text': '(19,', 'ref_id': None}, {'start': 862, 'end': 865, 'text': '20)', 'ref_id': None}, {'start': 1117, 'end': 1120, 'text': '104', 'ref_id': None}, {'start': 1637, 'end': 1640, 'text': '117', 'ref_id': None}], 'ref_spans': [], 'section': '70'}, {'text': 'To introduce mutations into the PK region, the pRep-ptGFP replicon plasmid was digested 121 with SpeI and KpnI and the resulting fragment inserted into a sub-cloning vector (pBluescript) 122 to create the pBluescript PK. PKs 3 and 4 were removed by digestion with HindIII and AatII 123 before insertion of a synthetic DNA sequence with PK 3 and 4 deleted. PKs 2, 3 and 4 were 124 deleted by PCR amplification using ΔPK 234 Forward primer and FMDV 1331-1311 reverse 125 primer, the resultant product was digested with HindIII and AatII and ligated into the 126 pBluescript PK vector. Complete PK deletion was achieved by introduction of an AflII site at 127 the 3′ end of the poly-C tract by PCR mutagenesis to create the sub-cloning vector, pBluescript 128 C11, which was then used to remove all the PKs by PCR mutagenesis using ΔPK 1234 forward 129 primer and FMDV 1331-1311 reverse primer. The modified PK sequences were removed from 130 the sub-cloning vectors and inserted into the pRep-ptGFP plasmid using NheI-HF and KpnI-131 HF.', 'cite_spans': [], 'ref_spans': [], 'section': '120'}, {'text': '132 133 author/funder. All rights reserved. No reuse allowed without permission.', 'cite_spans': [], 'ref_spans': [], 'section': '120'}, {'text': 'The copyright holder for this preprint (which was not peer-reviewed) is the . 
https://doi.org/10.1101/2020.01.10.901801 doi: bioRxiv preprint 7 Mutations to disrupt and reform PK structure were introduced using synthetic DNA by 134 digestion with AflII and AatII and ligation into a similarly digested pBluescript PK vector.', 'cite_spans': [], 'ref_spans': [], 'section': '120'}, {'text': 'Mutations were then introduced into the replicon plasmid as described above.', 'cite_spans': [], 'ref_spans': [], 'section': '135'}, {'text': 'To assess the effects of truncation of the poly-C-tract on replication the entire sequence was 137 removed. This was performed by PCR mutagenesis using primers C0 SpeI, and FMDV 1331- In vitro transcription. 143 In vitro transcription reactions for replicon assays were performed as described previously (28).', 'cite_spans': [{'start': 208, 'end': 211, 'text': '143', 'ref_id': None}], 'ref_spans': [], 'section': '136'}, {'text': 'Transcription reactions to produce large amounts of RNA for SHAPE analysis were performed 145 with purified linear DNA as described above, and 1 μg of linearised DNA was then used in a 146 HiScribe T7 synthesis kit (NEB), before DNase treatment and purification using a PureLink FastQ files were quality checked using FastQC with poor quality reads filtered using the 225 Sickle algorithm. Host cell reads were removed using FastQ Screen algorithm and FMDV 226 reads assembled de novo into contigs using IDBA-UD (35). Contigs that matched the FMDV 227 library (identified using Basic Local ALighnment Search Tool (BLAST)) were assembled 228 author/funder. All rights reserved. No reuse allowed without permission.', 'cite_spans': [{'start': 368, 'end': 371, 'text': '225', 'ref_id': None}], 'ref_spans': [], 'section': '144'}, {'text': 'The copyright holder for this preprint (which was not peer-reviewed) is the . https://doi.org/10.1101/2020.01.10.901801 doi: bioRxiv preprint into consensus sequences using SeqMan Pro software in the DNA STAR Lasergene 13 229 package (DNA STAR) (36). 
The SHAPE data largely agreed with the predicted structures with the stems of PK 1, 2 and 3, interacting nucleotides showed little to no reactivity, suggesting NMIA could not interact with 300 author/funder. All rights reserved. No reuse allowed without permission.', 'cite_spans': [], 'ref_spans': [], 'section': '144'}, {'text': 'The copyright holder for this preprint (which was not peer-reviewed) is the . https://doi.org/10.1101/2020.01.10.901801 doi: bioRxiv preprint 14 these nucleotides either due to the predicted base pairing or steric hindrance (figure 2B). The', 'cite_spans': [], 'ref_spans': [], 'section': '144'}, {'text': 'NMIA reactivity for the interacting nucleotides in the stem-loops with downstream residues of 302 PK 1, 2 and 3 again largely agreed with the predicted structure, although the SHAPE data 303 suggests that there might be fewer interactions than previously predicted. However, differences 304 here could be due to heterogeneity in the formation of PKs in this experiment. The evidence 305 for loop-downstream interaction was weaker for PK4. The copyright holder for this preprint (which was not peer-reviewed) is the . https://doi.org/10.1101/2020.01.10.901801 doi: bioRxiv preprint', 'cite_spans': [{'start': 187, 'end': 190, 'text': '303', 'ref_id': None}], 'ref_spans': [], 'section': '301'}, {'text': 'orientation. 351 Since removal of all four PKs resulted in a significant decrease in replication, the minimal 352 requirements to maintain wt levels of replication were investigated. As near wt level of 353 replication was observed when only one PK was present, all further mutagenesis was 354 performed in a C11 replicon plasmid containing only PK 1. 
In addition, the orientation of PK 1 was reversed by \"flipping\" the nucleotide sequence to 367 potentially facilitate hybridisation of the loop with upstream rather than downstream sequences.', 'cite_spans': [{'start': 13, 'end': 16, 'text': '351', 'ref_id': None}], 'ref_spans': [], 'section': 'Function of the PKs in replication is dependent on downstream interactions and 350'}, {'text': 'Changing the orientation of the PK reduced replicon replication to a similar level seen in the replication decreased until at passage three there is a 2.5 fold reduction compared to that of 398 author/funder. All rights reserved. No reuse allowed without permission.', 'cite_spans': [], 'ref_spans': [], 'section': '368'}, {'text': 'The copyright holder for this preprint (which was not peer-reviewed) is the . https://doi.org/10.1101/2020.01.10.901801 doi: bioRxiv preprint passage 0 (figure 5B). Therefore, it appears that replicons with a single PK are at a competitive 399 disadvantage compared to those with two or more. The copyright holder for this preprint (which was not peer-reviewed) is the . https://doi.org/10.1101/2020.01.10.901801 doi: bioRxiv preprint 20 of infectious virus despite being able to replicate after transfection into cells, is consistent with 448 a requirement for RNA structure within the PK region being required for virus assembly. The 5′ UTR of FMDV is unique amongst picornaviruses due to its large size and the presence 454 of multiple RNA elements, some of which still have unknown function. One of these features 455 is a series of repeated PKs varying in number from 2-4, depending on virus strain. In this study, 456 we sequentially deleted or mutated the PKs to help understand their role in the viral life cycle. 457 We also confirmed the predicted PK structures by SHAPE mapping, although there may be Although all viruses isolated to date contain at least two PKs, replicons or viruses containing a 464 single PK were still replication competent. 
However, replicons with more than a single PK 465 were found to have a competitive advantage over replicons with a single PK when sequentially 466 passaged. Replicons lacking all PKs displayed poor passaging potential even when co-467 transfected with yeast tRNA, reinforcing the observation of a significant impact in replication.', 'cite_spans': [{'start': 920, 'end': 923, 'text': '456', 'ref_id': None}, {'start': 1022, 'end': 1025, 'text': '457', 'ref_id': None}], 'ref_spans': [], 'section': '368'}, {'text': 'Moreover, viruses recovered from genomes with reduced numbers of PKs were slower growing 469 and produced smaller plaques. In addition, these differences were more pronounced in more PKs is functionally competent as no differences was seen between replicons congaing a single 472 author/funder. All rights reserved. No reuse allowed without permission.', 'cite_spans': [], 'ref_spans': [], 'section': '468'}, {'text': 'The copyright holder for this preprint (which was not peer-reviewed) is the . https://doi.org/10.1101/2020.01.10.901801 doi: bioRxiv preprint 21 copy of PK1 or PK4. This observation is consistent with a previous report of deletion of PK1, 473 along with the Poly-C-tract, with no adverse effect in viral replication (37). This also supports 474 our findings that the truncation of the Poly-C-tract to create the C11 construct had no effect on 475 replicon replication in the cell lines tested. As has been described with Mengo virus, it is 476 possible that the role of the poly-C-tract is essential in other aspects of the viral lifecycle which 477 cannot be recapitulated in a standard tissue culture system (39).', 'cite_spans': [{'start': 443, 'end': 446, 'text': '475', 'ref_id': None}], 'ref_spans': [], 'section': '468'}, {'text': 'The presence of at least two PKs in all viral isolates sequenced so far suggests that multiple 480 PKs confer a competitive advantage in replication. 
Here we showed by sequential passage that 481 replicons containing at least two PKs were maintained at a level similar to wt, but replicons 482 containing only one PK showed a persistent decline. It is unclear why some viral isolates 483 contain two, three or four PKs is still unknown, but this may be stochastic variation or may 484 reflect subtle effects of host range or geographical localisation. The copyright holder for this preprint (which was not peer-reviewed) is the . https://doi.org/10.1101/2020.01.10.901801 doi: bioRxiv preprint author/funder. All rights reserved. No reuse allowed without permission.', 'cite_spans': [], 'ref_spans': [], 'section': '479'}, {'text': 'The copyright holder for this preprint (which was not peer-reviewed) is the . https://doi.org/10.1101/2020.01.10.901801 doi: bioRxiv preprint The copyright holder for this preprint (which was not peer-reviewed) is the . https://doi.org/10.1101/2020.01.10.901801 doi: bioRxiv preprint Significance is shown comparing the replication of C11 PK disrupt and C11 PK restore (Aii). Significance shown is compared to wt replicon. Error bars are calculated by SEM, n = 3, * P 673 < 0.05, **** P < 0.0001. 674 author/funder. All rights reserved. No reuse allowed without permission.', 'cite_spans': [], 'ref_spans': [], 'section': '479'}, {'text': 'The copyright holder for this preprint (which was not peer-reviewed) is the . https://doi.org/10.1101/2020.01.10.901801 doi: bioRxiv preprint 33 675 author/funder. All rights reserved. No reuse allowed without permission.', 'cite_spans': [], 'ref_spans': [], 'section': '479'}, {'text': 'The copyright holder for this preprint (which was not peer-reviewed) is the . 
https://doi.org/10.1101/2020.01.10.901801 doi: bioRxiv preprint ', 'cite_spans': [], 'ref_spans': [], 'section': '479'}], 'bib_entries': {'BIBREF0': {'ref_id': 'b0', 'title': 'Genetic economy in 598 picornaviruses: Foot-and-mouth disease virus replication exploits alternative precursor 599 cleavage pathways', 'authors': [{'first': 'T', 'middle': [], 'last': 'Jackson', 'suffix': ''}, {'first': 'T', 'middle': ['J'], 'last': 'Tuthill', 'suffix': ''}, {'first': 'D', 'middle': ['J'], 'last': 'Rowlands', 'suffix': ''}, {'first': 'N', 'middle': ['J'], 'last': 'Stonehouse', 'suffix': ''}], 'year': 2017, 'venue': 'PLOS Pathog', 'volume': '13', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF2': {'ref_id': 'b2', 'title': 'A universal protocol to 602 generate consensus level genome sequences for foot-and-mouth disease virus and other 603 positive-sense polyadenylated RNA viruses using the Illumina MiSeq', 'authors': [{'first': 'N', 'middle': ['D'], 'last': 'Sanderson', 'suffix': ''}, {'first': 'N', 'middle': ['J'], 'last': 'Knowles', 'suffix': ''}, {'first': 'D', 'middle': ['P'], 'last': 'King', 'suffix': ''}, {'first': 'E', 'middle': ['M'], 'last': 'Cottam', 'suffix': ''}], 'year': 2014, 'venue': 'BMC Genomics', 'volume': '604', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF3': {'ref_id': 'b3', 'title': 'Library preparation for highly accurate population 606 sequencing of RNA viruses', 'authors': [{'first': 'A', 'middle': [], 'last': 'Acevedo', 'suffix': ''}, {'first': 'R', 'middle': [], 'last': 'Andino', 'suffix': ''}], 'year': 2014, 'venue': 'Nat Protoc', 'volume': '9', 'issn': '', 'pages': '1760--1769', 'other_ids': {}}, 'BIBREF4': {'ref_id': 'b4', 'title': 'IDBA-UD: a de novo assembler for 608 single-cell and metagenomic sequencing data with highly uneven depth', 'authors': [{'first': 'Y', 'middle': [], 'last': 'Peng', 'suffix': ''}, {'first': 'Hcm', 'middle': [], 'last': 'Leung', 'suffix': ''}, {'first': 'S', 'middle': ['M'], 'last': 'Yiu', 'suffix': ''}, 
{'first': 'Fyl', 'middle': [], 'last': 'Chin', 'suffix': ''}], 'year': 2012, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF6': {'ref_id': 'b6', 'title': 'Basic local alignment 611 search tool', 'authors': [{'first': 'S', 'middle': ['F'], 'last': 'Altschul', 'suffix': ''}, {'first': 'W', 'middle': [], 'last': 'Gish', 'suffix': ''}, {'first': 'W', 'middle': [], 'last': 'Miller', 'suffix': ''}, {'first': 'E', 'middle': ['W'], 'last': 'Myers', 'suffix': ''}, {'first': 'D', 'middle': ['J'], 'last': 'Lipman', 'suffix': ''}], 'year': 1990, 'venue': 'J Mol Biol', 'volume': '215', 'issn': '', 'pages': '403--410', 'other_ids': {}}, 'BIBREF7': {'ref_id': 'b7', 'title': 'Genetically engineered foot-and-613 mouth disease viruses with poly(C) tracts of two nucleotides are virulent in mice', 'authors': [{'first': 'E', 'middle': [], 'last': 'Rieder', 'suffix': ''}, {'first': 'T', 'middle': [], 'last': 'Bunch', 'suffix': ''}, {'first': 'F', 'middle': [], 'last': 'Brown', 'suffix': ''}, {'first': 'P', 'middle': ['W'], 'last': 'Mason', 'suffix': ''}], 'year': 1993, 'venue': 'J 614 Virol', 'volume': '67', 'issn': '', 'pages': '5139--5184', 'other_ids': {}}, 'BIBREF9': {'ref_id': 'b9', 'title': 'Both cis and trans Activities of Foot-and-Mouth Disease Virus 617 3D Polymerase Are Essential for Viral RNA Replication', 'authors': [{'first': 'N', 'middle': ['J'], 'last': 'Stonehouse', 'suffix': ''}], 'year': 2016, 'venue': 'J Virol', 'volume': '90', 'issn': '', 'pages': '6864--6883', 'other_ids': {}}, 'BIBREF10': {'ref_id': 'b10', 'title': 'Mutational analysis of the 619 mengovirus poly(C) tract and surrounding heteropolymeric sequences', 'authors': [{'first': 'L', 'middle': [], 'last': 'Martin', 'suffix': ''}, {'first': 'G', 'middle': [], 'last': 'Duke', 'suffix': ''}, {'first': 'J', 'middle': [], 'last': 'Osorio', 'suffix': ''}, {'first': 'D', 'middle': [], 'last': 'Hall', 'suffix': ''}, {'first': 'A', 'middle': [], 'last': 'Palmenberg', 'suffix': ''}], 
'year': 1996, 'venue': 'J Virol', 'volume': '620', 'issn': '', 'pages': '2027--2031', 'other_ids': {}}, 'BIBREF11': {'ref_id': 'b11', 'title': 'No reuse allowed without permission. The copyright holder for this preprint (which was not peer-reviewed) is the', 'authors': [], 'year': None, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {'DOI': ['10.1101/2020.01.10.901801']}}, 'BIBREF12': {'ref_id': 'b12', 'title': 'Figure 3. The poly-C-tract is dispensable and only one PK is required for wt replication', 'authors': [], 'year': None, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF13': {'ref_id': 'b13', 'title': 'A replicon 650 with entire poly-C-tract removed (C0) was transfected alongside wt, 3D-GNN and C11 651 replicons into BHK-21 cells (B). Replicons with sequentially deleted PKs (ΔPK 34, ΔPK 234 652 and C11 ΔPK 1234) were assayed for replication in BHK', 'authors': [{'first': '3d', 'middle': [], 'last': 'Wt', 'suffix': ''}], 'year': None, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF14': {'ref_id': 'b14', 'title': 'All replication assays were measured by counting the number of GFP 655 author/funder. All rights reserved. No reuse allowed without permission. The copyright holder for this preprint (which was not peer-reviewed) is the', 'authors': [], 'year': None, 'venue': 'Replication of replicon with PK 4 as the sole remaining PK (C11 PK 4)', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {'DOI': ['10.1101/2020.01.10.901801']}}, 'BIBREF15': {'ref_id': 'b15', 'title': 'Error bars shown are calculated by SEM, n = 3', 'authors': [], 'year': None, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {}}, 'BIBREF17': {'ref_id': 'b17', 'title': 'No reuse allowed without permission. 
The copyright holder for this preprint (which was not peer-reviewed) is the', 'authors': [], 'year': None, 'venue': '', 'volume': '', 'issn': '', 'pages': '', 'other_ids': {'DOI': ['10.1101/2020.01.10.901801']}}}, 'ref_entries': {'FIGREF0': {'text': 'and-mouth disease virus (FMDV) is a single stranded positive sense RNA virus of the 45 genus Aphthovirus in the family Picornaviridae. It occurs as seven, antigenically diverse 46 serotypes; A, O, C, Asia 1, South African Territories (SAT) 1, 2 and 3. It is the causative agent 47 of foot-and-mouth disease (FMD), a highly contagious disease of cloven-hooved animals 48 affecting most notably cattle, pigs, sheep and goats in addition to wild species such as the 49 African buffalo. Disease outbreaks have serious economic implications resulting from trade 50 restrictions, reduced productivity and the slaughter of infected and at-risk animals (1). The 51 2001 outbreak in the UK caused economic losses of over £8 billion to the tourism and 52 agricultural sectors. Inactivated virus vaccines are used in countries in which FMD is endemic, 53 but these are often strain-specific and provide little cross protection between serotypes (2). 54 Antigenic variation together with the relatively short duration of immunity following 55 vaccination combine to complicate control of the disease (3). In addition, the carrier state, in 56 which asymptomatically infected animals continue to shed virus, contributes to the spread of 57 FMDV (4). An improved understanding of the viral life cycle may be important for the 58 development of improved vaccines and other control measures. 59 60 The FMDV genome (approximately 8.4 kb) consists of a single open reading frame flanked by 61 5′ and 3′ untranslated regions (UTRs) (figure 1A) (5). The translated region encodes both 62 structural and non-structural proteins. 
The P1 region encodes the capsid structural proteins VP1, 63', 'latex': None, 'type': 'figure'}, 'FIGREF1': {'text': '73 large poly-C tract of variable length (which can be up to 200 nt), a region containing two to 74 four tandemly repeated pseudoknots (PKs), the cis acting replication element (cre) and the 75 internal ribosome entry site (IRES) (5, 11, 12). Of these five structural domains, functions have 76 been ascribed to only two, the cre and IRES. The cre region is involved in uridylation of the 77 RNA primer peptide, VPg (also known as 3B), and the IRES determines the initiation of 78 translation of the viral polyprotein (13, 14). The roles of the S-fragment, the poly-C tract and 79 the PKs in viral replication are not fully elucidated, however recent studies have shown that 80 truncations to the S-fragment can play key roles in the innate immune response to viral 81 infection (15-17). It has also recently been reported that viruses containing a deletion within 82 the pseudoknot region showed an attenuated phenotype in bovine cell lines while remaining 83 unchanged in porcine, suggesting a role for the pseudoknots in viral tropism (18).', 'latex': None, 'type': 'figure'}, 'FIGREF3': {'text': '138 1311 as forward and reverse primers respectively. The PCR product was digested with SpeI 139 and KpnI before ligation into a NheI and KpnI digested wt pRep ptGFP replicon. Sequences of 140 all primers are available upon request.141 142', 'latex': None, 'type': 'figure'}, 'FIGREF5': {'text': 'prepared as above and a sample (12 pmol) was heated to 95 o C for 2 minutes before 151 cooling on ice. RNA folding buffer (100 mM HEPES, 66 mM MgCl2 and 100 mM NaCl) and 152 RNase Out (Invitrogen) was added to the RNA and incubated at 37 o C for 30 minutes. Once 153 folded, RNA was treated with NMIA compound at a final concentration of 5 mM or DMSO as 154 a negative control for 50 minutes at 37 o C. 
Following incubation, labelled RNA was ethanol 155 precipitated and resuspended in 10 μl 0.5 x TE buffer.', 'latex': None, 'type': 'figure'}, 'FIGREF6': {'text': '10 minutes in a thermocycler. A reverse transcription master mix containing 4 μl first 161 strand buffer, 1 μl 100 mM DTT, 0.5 μl RNase Out, 1 μl Supsercript III (Invitrogen), 1 μl 10 162 mM PCR dNTP mix (Promega) and 0.5 μl RNase free water, was then added to the 163 RNA/primer complex and extension carried out by incubation at 52 o C for 30 minutes.', 'latex': None, 'type': 'figure'}, 'FIGREF7': {'text': ', cDNA:RNA hybrids were disassociated by incubation with 1 μl 4M NaOH at 166 95 o C for 3 minutes before neutralisation with 2 μl 2M HCl. Extended cDNA was ethanol 167 precipitated and resuspended in 40 μl deionized formamide (Thermo Fisher). Sequencing 168 ladders were made similarly using 6 pmol of RNA with the inclusion of 1 μl 10 mM ddCTP in 169 the reverse transcription mix and using a differentially labelled fluorescent primer (either Hex 170 or FAM). 20 μl of sequencing ladder was combined with NMIA or DMSO samples and 171 dispatched on dry ice for capillary electrophoresis (Dundee DNA seq).', 'latex': None, 'type': 'figure'}, 'FIGREF8': {'text': 'was analysed using QuShape and reactivity overlaid onto the RNA 174 structure using VARNA (29, 30).', 'latex': None, 'type': 'figure'}, 'FIGREF9': {'text': 'in all cell lines was assessed in 24-well plates with 0.5 µg/cm 2 of RNA 178 using Lipofectin transfection reagent (Life Technologies) as previously described (28). For 179 complementation assays, BHK-21 cells seeded into 24-well plates were allowed to adhere for 180 16 hours before transfection with 1 µg of replicon RNA using Lipofectin. Each transfection 181 was performed in duplicate and experiments were biologically repeated. 
Replicon replication by live cell imaging using an IncuCyte Zoom Dual colour FLR, an automated 183 phase-contrast and fluorescence microscope within a humidifying incubator. At hourly 184 intervals up to 24 hours post transfection, images of each well were taken and used to count 185 the number of ptGFP positive cells per well.', 'latex': None, 'type': 'figure'}, 'FIGREF10': {'text': 'competition assays was performed by co-transfecting BHK-21 cells with in vitro 188 transcribed replicon RNA and harvesting total cell RNA at 8 hours post transfection using 189 TRIzol reagent (Thermo Fisher Scientific). The harvested RNA was then purified using the 190 Direct-zol RNA MiniPrep kit (Zymo Research) with on-column DNase I treatment and eluted 191 in DEPC treated water. The purified passaged RNA (1 µg) was transfected onto the naïve BHKhere are based on plasmid T7S3 which encodes a full length infectious copy 196 of FMDV O1 Kaufbeuren (31). The reporter was removed from replicons by digestion with 197 PsiI and XmaI restriction enzymes and replaced with the corresponding fragment from pT7S3 198 encoding the capsid proteins. Full length viral RNA was transcribed using a T7 MEGAscript 199 kit (Thermo Fisher Scientific), DNase treated using TurboDNase (Thermo Fisher Scientific) 200 and purified using a MEGAclear Transcription Clean-Up kit (Thermo Fisher Scientific). 201 RNA quality and concentration were determined by denaturing agarose gel electrophoresis 202 and Qubit RNA BR Assay Kit (Thermo Fisher Scientific).', 'latex': None, 'type': 'figure'}, 'FIGREF11': {'text': '-transfection cell lysates were freeze-thawed and clarified by centrifugation.208 Clarified lysate was blind passaged onto naïve BHK-21 cells, this was continued for five 209 rounds of passaging.210 211 Sequencing of recovered virus. 212 Recovered viruses at passage 4, were sequenced using an Illumina Miseq (illumine) using a 213 modified version of a previously described PCR-free protocol ((32, 33)). 
Total RNA was 214 extracted from clarified passage 4 lysates using TRizol reagent (Thermo Fisher Scientific) 215 and residual genomic DNA removed using DNA-free DNA removal Kit (Thermo Fisher 216 Scientific). RNA was precipitated using 3 M sodium acetate and ethanol, 10 ul of purified 217 RNA (containing 1 pg to 5 µg) of RNA was used in a reverse transcription reaction as 218 previously described (33, 34). Following reverse transcription cDNA was purified and 219 quantified using a Qubit ds DNA HS Assay kit (Thermo Fisher Scientific) and a cDNA 220 library prepared using Nextera XT DNA Sample Preparation Kit (Illumina). Sequencing was 221 carried out on the MiSeq platform using MiSeq Reagent Kit v2 (300 cycles) chemistry 222 (Illumina).', 'latex': None, 'type': 'figure'}, 'FIGREF13': {'text': 'of recovered virus. 232 Confluent BHK-21 cell monolayers were infected with 10-fold serial dilutions of virus stock, 233 overlaid with Eagle overlay media supplemented with 5 % tryptose phosphate broth solution 234 (Sigma Aldrich), penicillin (100 units/ml and streptomycin (100 µg/ml) (Sigma Aldrich) and 235 0.6 % Indubiose (MP Biomedicals) and incubated for 48 hours at 37 o C. Cells were fixed and 236 stained with 1 % (w/v) methylene blue in 10 % (v/v) ethanol and 4 % formaldehyde in PBS. 237 238 Fixed plaques were scanned and images measured using a GNU Image Manipulation 239 Program IMP (GIMP, available at https://www.gimp.org). For each plaque, horizontal and 240 vertical diameter in pixels was taken and an average of these two values was calculated. All 241 plaques per well were measured. 242 243 Cell killing assays. 244 Virus titre was determined by plaque assays. BHK-21 cells were seeded with 3 x10 4 245 cells/well in 96 well plates and allowed to settle overnight. Cell monolayers were inoculated 246 with each rescued virus at MOI of 0.01 PFU for 1 hour, inoculum was removed and 150 µl of 247 fresh GMEM (supplemented with 1 % FCS) was added to each well. 
Appearance of CPE was 248 monitored every 30 minutes using the IncuCyte S3.', 'latex': None, 'type': 'figure'}, 'FIGREF14': {'text': 'protein 3A, and a goat anti-Mouse IgG (H+L) highly cross-adsorbed 254 secondary antibody, Alexa Fluor 488 (Life Technologies). Each transcript was transfected in 255 triplicate and the experiment biologically repeated three times. BHK-21 cells were seeded 256 into T25 flasks 16 hours prior to transfection with 10 µg RNA. The transfection mix was left 257 on the cells for 1 hour before the media was changed to VGM (Glasgow Minimum Essential 258 Medium (Sigma-Aldrich), 1% Foetal Bovine Serum -Brazil origin (Life Science Production) 259 and 5% Tryptose Phosphate Broth (Sigma-Aldrich).', 'latex': None, 'type': 'figure'}, 'FIGREF15': {'text': 'After a further 3 hours, cells were dissociated using trypsin-EDTA 0.05% phenol red (Life261 Technologies), pelleted at 200 g for 3 minutes and fixed in 4% paraformaldehyde for 40 262 minutes. Cells were then transferred to a 96-well u-bottom plate and pelleted; this and all 263 subsequent pelleting steps were done at 300 xg for 5 minutes. Cells were resuspended in 0.5% 264 BSA in PBS blocking buffer (Melford), pelleted and resuspended in 1/1000 2C2 antibody and 265 left shaking at 500 rpm at 4 o C for 14 hours in an Eppendorf Thermomixer C plate shaker. The 266 cells were pelleted and subsequently resuspended in blocking buffer three times to wash, 267 resuspended in 1/200 anti-mouse fluorescent secondary antibody and rotated at 500 rpm at 268 24 o C for 1 hour before washing a final three times. Cells were then resuspended in 500 µl PBS 269 and data were collected on the LSR Fortessa (BD Biosciences) using BD FACSDivaTM 270 software. 
Data were exported as flow cytometry standard (FCS) files, and were analysed in 271 FlowJo 10 using the gating strategy shown in Figure 7.', 'latex': None, 'type': 'figure'}, 'FIGREF16': {'text': 'of PKs was initially predicted in 1987 by computational and visual analysis of 279 the 5′ UTR sequence (12). The prediction of the presence of multiple PKs was strengthened by 280 the observation that variation in the length of this region between different virus isolates 281 equated to the gain or loss of PK-length sequence units. However, the definitive demonstration 282 of PK structure remains a challenge. Here, we used selective 2′ hydroxyl acylation analysed by 283 primer extension (SHAPE) to investigate the secondary structure of the PK region.', 'latex': None, 'type': 'figure'}, 'FIGREF17': {'text': 'representing FMDV UTRs were folded prior to treatment with NMIA, a 286 compound that forms 2′-O-adducts when interacting with non-paired nucleotides, or DMSO as 287 a negative control. Labelled RNAs were purified and used as templates in reverse transcription 288 reactions using fluorescently labelled primers. Elongation of the reverse transcription products 289 terminates at adducts, resulting in cDNA fragments of different lengths, which were analysed 290 by gel electrophoresis alongside a sequencing ladder to identify sites of NMIA interaction. The 291 whole PK region was surprisingly reactive suggesting that it was largely single stranded or 292 highly flexible (figure 2A). To investigate if the SHAPE data agreed with the predicted 293structure, the NMIA reactivity was overlaid onto the previous PK structure prediction (12).', 'latex': None, 'type': 'figure'}, 'FIGREF19': {'text': '296 being unreactive, suggestive of base-pairing. Formation of the stem of PK4 was less convincing, 297 although the stem nucleotides still had relatively low reactivity in agreement with the other PK 298 models. 
For all the PKs, the nucleotides in the loop regions and the predicted downstream299', 'latex': None, 'type': 'figure'}, 'FIGREF20': {'text': 'the NMIA reactivities with the original predicted structure the SHAPE data were 308 compatible to the PK models and potentially shed new light on the requirements of the loop 309 interactions.', 'latex': None, 'type': 'figure'}, 'FIGREF21': {'text': 'PK is sufficient for efficient replication.312 The replicon system was based on the O1K FMDV sequence which includes four similar but 313 non-identical PKs (figure 1). The PKs were sequentially deleted from the 3′ side (i.e. PK 4-PK 314 1), and replication of the resulting modified replicons assessed.', 'latex': None, 'type': 'figure'}, 'FIGREF22': {'text': 'complete removal of all PKs, an AflII site was inserted into the ptGFP replicon 317 plasmid which resulted in reduction of the poly-C-tract to 11 cytosine residues. This C11 318 replicon was investigated alongside a wt replicon and one with lethal polymerase mutations 319 (3D-GNN). These controls were used to confirm that truncation of the poly-C tract had no 320 measurable effect on replication the two cell lines tested, as previously reported (37) (figure 321 3A). For completeness, we further removed the entire poly-C-tract (C0) and showed that this 322 had no observable negative effect on replication of the replicon (figure 3B). The C11 construct 323 was then used as the \"backbone\" for removal of all four PKs.', 'latex': None, 'type': 'figure'}, 'FIGREF23': {'text': 'measuring ptGFP reporter expression, in parallel with transfection of a wt and 327 3D-GNN replicon, where the 3D-GNN replicon is used to monitor ptGFP expression resulting 328 from translation of input RNA in the absence of replication. 
Reporter expression was recorded 329 using an IncuCyte Zoom automatic fluorescent microscope and is shown at 8 hours post-', 'latex': None, 'type': 'figure'}, 'FIGREF25': {'text': '∆PK 234 respectively) replicated at similar levels to the wt replicon (figure 3C-D). 334 However, a replicon containing no PKs (C11 ∆PK 1234) showed a significant (~ 4 fold) 335 reduction in replication in BHK-21 cells compared to the wt C11 replicon. A larger reduction 336 in replication (28 fold) was seen in the MDBK cell line, supporting previous publications on 337 the potential role in host cell tropism (18). Replication of the C11 ∆PK 1234 replicon in MDBK 338 cells was however still significantly above that of the 3D-GNN negative control. These data 339 suggest that although the PKs are not essential for replication at least one PK is required for wt 340 levels of replication. 341 342In the experiments above PK1 was the sole remaining PK and we therefore investigated 343 whether other PKs could similarly support wt replication. We deleted all the PKs to create the 344 C11 construct and re-inserted PK4 as the only PK (C11 PK4). Near wt levels of replication 345 were observed following transfection into both cell types suggesting that there is no functional 346 difference between PK1 and PK4 (figure 3E).', 'latex': None, 'type': 'figure'}, 'FIGREF27': {'text': 'to interrupt base pairing and abrogate formation of the PK structure were 357 made in the loop of PK 1 and the corresponding downstream nucleotides. The substitutions 358 (shown in red) created a GAGA motif both in the loop and downstream regions and reduced 359 the replication of the mutated replicon (C11 PK disrupt) equivalent to that of the replicon 360 containing no PKs, thereby supporting the predicted structure (figure 4A). Base pairing 361 potential was then restored by mutation of the relevant nucleotides in the loop and downstream 362 region to GGGG and CCCC respectively. 
Restoring the interaction using an alternate sequence 363 increased replication significantly compared to the disrupted PK replicon (~ 4 fold), although 364 this was still slightly below that of the wt (~ 0.7 fold decrease) (figure 4A).', 'latex': None, 'type': 'figure'}, 'FIGREF29': {'text': '369absence of PKs (figure 4B). This suggests that the role of the PKs in genome replication is 370 dependent on both sequence, structure and orientation.', 'latex': None, 'type': 'figure'}, 'FIGREF30': {'text': 'studies above suggested that removal of up to three of the four PKs present in the 375 wt sequence had no clear effect on replicon replication, although deletion of all four was 376 significantly detrimental. To investigate whether multiple PKs conferred more subtle 377 advantages for replication than were evident from single round transfection experiments we 378 carried out sequential passages of replicon RNA following transfection of the PK deleted forms 379 in competition with a wt replicon. Different reporter genes (ptGFP or mCherry) were used to 380 distinguish the competing replicons.', 'latex': None, 'type': 'figure'}, 'FIGREF31': {'text': 'ptGFP; wt, ∆PK 34, ∆PK 234 and C11 ∆PK 1234 were co-transfected into 383 BHK-21 cells together with either a wt mCherry replicon or yeast tRNA as a control. The 384 replication of each of the co-transfected replicons was compared by observing ptGFP and 385 mCherry expression over three sequential passages. Passaging was achieved by harvesting total 386 RNA using Trizol-reagent 8 hours post-transfection. Harvested RNA was purified and then re-387 transfected into naïve BHK-21 cells.', 'latex': None, 'type': 'figure'}, 'FIGREF32': {'text': 'transfection of the wt, ∆PK 34 or ∆PK 234 with yeast tRNA as controls showed no 390 differences in replication as expected (Figure 5A). 
Likewise, when PK mutants were co-391 transfected with a wt replicon after three passages, the number of green fluorescent cells 392 produced by the ∆PK 34 replicon was comparable to that of the wt, suggesting no competitive 393 advantage of four PKs over two. For both, there was a reduction in replication after the first 394 passage but recovery to near that of the original transfection by the third passage. However, 395 when co-transfected with the wt replicon, the ∆PK 234 replicon showed a similar drop in 396 replication in passage two, but showed no subsequent recovery following each passage and397', 'latex': None, 'type': 'figure'}, 'FIGREF33': {'text': 'transfection with the wt mCherry replicon reduced the replication of the C11 ∆PK 1234 402 replicon to background levels as seen when comparing to the yeast tRNA control. By passage 403 two the ptGFP signal of the C11 ∆PK 1234 was no longer detectable, suggesting that this 404 replicon has been out competed (figure 5C). Although the initial replication of C11 ∆PK 1234 405 was greater when co-transfected with yeast tRNA than when in competition with wt mCherry 406 replicon, the ptGFP signal was reduced at passage two and was at background level by passage 407 three (figure 5C). Replication of the mCherry wt replicon was not influenced by co-transfection 408 with the ptGFP constructs (figure 5D), as expected. Together these data suggest that the minor 409 replicative advantage conferred by multiple PKs are quickly compounded over multiple 410 replication cycles to provide a replicative advantage.', 'latex': None, 'type': 'figure'}, 'FIGREF34': {'text': 'a PK is essential for the production of infectious virus413 As replicons lacking all PKs could replicate and replicons with reduced numbers of PKs414 appeared to be at a competitive disadvantage compared to the wt construct, we investigated the 415 consequences of PK manipulation on the complete viral life cycle. 
The ∆PK 34, ∆PK 234 and 416 C11 ∆PK 1234 mutations were introduced into an FMDV infectious clone by replacement of 417 sequence encoding ptGFP with that encoding the O1K structural proteins. RNA transcripts 418 were transfected into BHK-21 cells alongside a wt O1K viral transcript and blind passaged 5 419 times by transferring the cell supernatant at 24 hours post transfection onto naïve BHK-21 cells.', 'latex': None, 'type': 'figure'}, 'FIGREF36': {'text': '427 rate of CPE (figure 6A) and plaque size (figure 6B-C) of ∆PK 34 and ∆PK 234 when compared 428 to the wt O1K virus. Rate of CPE was monitored by infecting BHK-21 cells with a known MOI 429 (0.01) of recovered virus, cells were then monitored for signs of CPE (shown as a decrease in 430 cell confluency) as measured by an automated imaging platform (Incucyte Zoom). Both ∆PK 431 34 and ∆PK 234 showed delayed onset of CPE with ∆PK 34 being the slowest, initial CPE 432 occurring at approximately 39 hours and 29 hours post infection respectively, compared to the 433 22 hours seen in the wt control. This mirrored plaque assay data where ∆PK 34 displayed a 434 significantly smaller plaque phenotype when compared to the wt control (average of 13.8 pixels 435 compared to 37.4), the slower rate of CPE seen in ∆PK 234 made a small, but not significant 436 difference (average 31.9 pixels).', 'latex': None, 'type': 'figure'}, 'FIGREF37': {'text': '∆PK 1234 produced no infectious virus the ability of the full-length genome lacking 439 PKs to replicate was investigated. BHK-21 cells were transfected with the same RNA 440 transcripts as above alongside additional controls, mock-transfected and transfected with wt 441 and treated with 3 mM GuHCl (a replication inhibitor) as negative controls. Six hours post-442 transfection, cells were harvested, fixed and labelled with an anti-3A antibody and fluorescent 443 secondary antibody. 
Cells were then analysed using flow cytometry and anti-3A antibody 444 signal used as an indirect measure of genome replication (figure 7). The results were similar to 445 those of the replicon experiments and showed that all the modified virus genomes were able to 446 undergo robust replication. The inability of the C11 ∆PK 1234 genome to support production 447 author/funder. All rights reserved. No reuse allowed without permission.', 'latex': None, 'type': 'figure'}, 'FIGREF39': {'text': '458fewer strong interactions maintaining the PKs than was previously predicted. This may indicate 459 high conformational flexibility of this region of the genome. SHAPE mapping was also 460 supported by mutation of predicted key interactions between nucleotides in the loop and 461 downstream, disruption of which reduced replication to that of the C11 ∆PK 1234 replicon.', 'latex': None, 'type': 'figure'}, 'FIGREF41': {'text': '470 relevant cells lines (i.e. in MDBK cells compared to BHK 21 cells). It is likely that each of the 471', 'latex': None, 'type': 'figure'}, 'FIGREF42': {'text': 'although removal of all four PKs resulted in a significant decrease in replicon and 487 viral genome replication, replication was not abolished, showing that PKs are not essential to 488 support genome replication. However, deletion of all PKs from an infectious clone completely 489 abolished the ability to recover infectious virus. This suggests that the genome lacking all PKs 490 is defective in a function associated with virion assembly and is compatible with our evidence 491 for the presence of a packaging signal in a similar location on the genome to PK1 (22). It is 492 possible that structural flexibility at this site in the genome allows the RNA to adopt alternate 493 conformations with different roles in genome replication and virion assembly. 
A functional 494 requirement for multiple RNA conformations may explain the relatively weak interactions 495 between nucleotides involved in stabilising the PK motif as observed by SHAPE analysis or 496 by structural prediction.', 'latex': None, 'type': 'figure'}, 'FIGREF43': {'text': 'was supported by funding from the Biotechnology and Biological Sciences Research 503 Council (BBSRC) of the United Kingdom (research grant BB/K003801/1). Additionally, the 504 Pirbright Institute receives grant-aided support from the BBSRC (projects BB/E/I/00007035, 505 BB/E/I/00007036 and BBS/E/I/00007037).', 'latex': None, 'type': 'figure'}, 'FIGREF44': {'text': 'Jones TJD, Rushton J. 2013. The economic impacts of foot and mouth disease 511 -What are they, how big are they and where do they occur? Prev Vet Med 112:161-M, Parida S. 2018. Foot and mouth disease vaccine strain selection: current 514 approaches and future perspectives. Expert Rev Vaccines 17:577-591.', 'latex': None, 'type': 'figure'}, 'FIGREF45': {'text': 'J-H. 2013. Requirements for improved vaccines against foot-and-mouth disease 516 epidemics. Clin Exp Vaccine Res 2:8-18.', 'latex': None, 'type': 'figure'}, 'FIGREF46': {'text': 'C, Eschbaumer M, Rekant SI, Pacheco JM, Smoliga GR, Hartwig EJ, 518 Rodriguez LL, Arzt J. 2016. The Foot-and-Mouth Disease Carrier State Divergence in 519 Cattle. J Virol 90:6344-64.', 'latex': None, 'type': 'figure'}, 'FIGREF47': {'text': \"C, Tulman ER, Delhon G, Lu Z, Carreno A, Vagnozzi A, Kutish GF, Rock 521 DL. 2005. Comparative genomics of foot-and-mouth disease virus. J Virol 79:6487-D. 1997. Dissecting the roles of VP0 cleavage and RNA packaging in 525 picornavirus capsid stabilization: the structure of empty capsids of foot-and-mouth 526 disease virus. J Virol 71:9743-52. 527 7. Gao Y, Sun S-Q, Guo H-C. 2016. Biological function of Foot-and-mouth disease virus 528 non-structural proteins and non-coding elements. Virol J 13:107. 529 8. 
Herod MR, Gold S, LaseckaDykes L, Wright C, Ward JC, McLean TC, Forrest S, 530 Jackson T, Tuthill TJ, Rowlands DJ, Stonehouse NJ. 2017. Genetic economy in 531 picornaviruses: Foot-and-mouth disease virus replication exploits alternative precursor 532 cleavage pathway. PLOS Pathog 13:e1006666. 533 9. Tulloch F, Pathania U, Luke GA, Nicholson J, Stonehouse NJ, Rowlands DJ, Jackson 534 T, Tuthill T, Haas J, Lamond AI, Ryan MD. 2014. FMDV replicons encoding green 535 fluorescent protein are replication competent. J Virol Methods 209:35-40. 536 10. Herod MR, Tulloch F, Loundras E-A, Ward JC, Rowlands DJ, Stonehouse NJ. 2015. 537 Employing transposon mutagenesis to investigate foot-and-mouth disease virus 538 replication. J Gen Virol 96:3507-3518. 539 11. Mellor EJC, Brown F, Harris TJR. 1985. Analysis of the Secondary Structure of the 540 Poly(C) Tract in Foot-and-Mouth Disease Virus RNAs. J Gen Virol 66:1919-1929. 541 12. Clarke BE, Brown AL, Currey KM, Newton SE, Rowlands DJ, Carroll AR. 1987. 542 Potential secondary and tertiary structure in the genomic RNA of foot and mouth 543 disease virus. Nucleic Acids Res 15:7067-7079. 544 13. Nayak A, Goodfellow IG, Woolaway KE, Birtley J, Curry S, Belsham GJ. 2006. Role 545 of RNA structure and RNA binding activity of foot-and-mouth disease virus 3C 546 protein in VPg uridylylation and virus replication. J Virol 80:9865-75. Kloc A, Diaz-San Segundo F, Schafer EA, Rai DK, Kenney M, de los Santos T, 555 Rieder E. 2017. Foot-and-mouth disease virus 5'-terminal S fragment is required for 556 replication and modulation of the innate immune response in host cells. Virology 557 512:132-143. 558 17. Kloc A, Rai DK, Rieder E. 2018. The roles of picornavirus untranslated regions in 559 infection and innate immunity. Front Microbiol. Frontiers Media S.A. 560 18. Zhu Z, Yang F, Cao W, Liu H, Zhang K, Tian H, Dang W, He J, Guo J, Liu X, Zheng 561 H. 2019. 
The Pseudoknot Region of the 5' Untranslated Region Is a Determinant of 562 Viral Tropism and Virulence of Foot-and-Mouth Disease Virus. J Virol 93. 563 19. Mohapatra JK, Pawar SS, Tosh C, Subramaniam S, Palsamy R, Sanyal A, Hemadri D, 564 Pattnaik B. 2011. Genetic characterization of vaccine and field strains of serotype A 565 foot-andmouth disease virus from India. Acta Virol 55:349-352. 566 20. Escarmís C, Dopazo J, Dávila M, Palma EL, Domingo E. 1995. Large deletions in the 567 5'-untranslated region of foot-and-mouth disease virus of serotype C. Virus Res 568 35:155-67.\", 'latex': None, 'type': 'figure'}, 'FIGREF48': {'text': 'Carocci M, Bakkali-Kassimi L. 2012. The encephalomyocarditis virus. Virulence', 'latex': None, 'type': 'figure'}, 'FIGREF49': {'text': \". Wutz G, Auer H, Nowotny N, Grosse B, Skern T, Kuechler E. 1996. Equine rhinovirus Xrn1 produce a pathogenic Dengue virus RNA. Elife 3. 576 24. Kieft JS, Rabe JL, Chapman EG. 2015. New hypotheses derived from the structure of 577 a flaviviral Xrn1-resistant RNA: Conservation, folding, and host adaptation. RNA Biol 578 12:1169-77.579 25. Gultyaev AP, Olsthoorn RCL. 2010. A family of non-classical pseudoknots in 580 influenza A and B viruses. RNA Biol 7:125-9. 581 26. Moss WN, Dela-Moss LI, Priore SF, Turner DH. 2012. The influenza A segment 7 582 mRNA 3' splice site pseudoknot/hairpin family. RNA Biol 9:1305-10. 583 27. Plant EP, Dinman JD. 2008. The role of programmed-1 ribosomal frameshifting in 584 coronavirus propagation. Front Biosci 13:4873-81. 585 28. Herod MR, Ferrer-Orta C, Loundras E-A, Ward JC, Verdaguer N, Rowlands DJ, 586 Stonehouse NJ. 2016. Both cis and trans Activities of Foot-and-Mouth Disease Virus 587 3D Polymerase Are Essential for Viral RNA Replication. J Virol 90:6864-6883. 588 29. Karabiber F, McGinnis JL, Favorov O V., Weeks KM. 2013. QuShape: Rapid, 589 accurate, and best-practices quantification of nucleic acid probing information, 590 resolved by capillary electrophoresis. 
RNA 19:63-73. 591 30. Darty K, Denise A, Ponty Y. 2009. VARNA: Interactive drawing and editing of the 592 RNA secondary structure. Bioinformatics 25:1974-1975. 593 31. King AMQ, Blakemore WE, Ellard FM, Drew J, Stuart DI. 1999. Evidence for the role 594 of His-142 of protein 1C in the acid-induced disassembly of foot-and-mouth disease 595 virus capsids. J Gen Virol 80:1911-1918. 596 32. Herod MR, Gold S, Lasecka-Dykes L, Wright C, Ward JC, McLean TC, Forrest S, 597 author/funder. All rights reserved. No reuse allowed without permission.\", 'latex': None, 'type': 'figure'}, 'FIGREF50': {'text': \"Replicon and PK schematic. Schematic of the FMDV O1K sub-genomic replicon, 627 showing both 5' and 3' untranslated regions (UTRs) together with the RNA structures present 628 in these regions. IRES-driven translation produces a single polyprotein. Here, the structural 629 proteins have been replaced with a green fluorescent reporter, upstream of the non-structural 630 proteins 2A-3D (A). Predicted PK structures, with putative interactions highlighted in hot-pink 631 are shown. Numbers indicate nucleotide positions after the poly-C-tract (B). Sequence 632 alignment of the 4 PKs, with the interacting regions shown in hot-pink and invariant 633 nucleotides represented by asterisk (C).\", 'latex': None, 'type': 'figure'}, 'FIGREF51': {'text': 'SHAPE NMIA reactivity of the PK region. NMIA reactivity at nucleotides 640 following the poly-C-tract (PCT). High reactivity indicates increased chance of the nucleotide 641 being non base-paired at that position (A). NMIA reactivity of each PK overlaid onto the 642 predicted PK structure using VARNA (30). Loop and downstream interactions represent those 643 supported by SHAPE data (B). NMIA reactivity is represented on a colour scale from low 644 (white) to high (red) (n = 4). 645 646 author/funder. All rights reserved. 
No reuse allowed without permission.', 'latex': None, 'type': 'figure'}, 'FIGREF52': {'text': \"Disrupting the PK structure and reversing the orientation of a PK reduces 663 replication. Cartoon representations of disrupting and restoring mutations made to PK 1, 664 where nucleotides in the bulge of the stem loop and interacting region downstream were 665 mutated to disrupt structure formation 'PK disrupt', or mutated to maintain bulge and 666 downstream interaction but with different nucleotides 'PK restore' (Ai). Replication of PK 667 disrupt and restore mutants were measured by transfection of RNA into BHK-21 cells and 668 shown here at 8 hours post-transfection alongside wt, 3D-GNN and C11 ΔPK 1234 controls.\", 'latex': None, 'type': 'figure'}, 'FIGREF54': {'text': 'Visual representation of the reversing of the nucleotide sequence of PK1 creating the C11 PK 671 Rvs construct (Bi). Replication of PK Rvs at 8 hours post transfection of BHK-21 cells (Bii).', 'latex': None, 'type': 'figure'}, 'FIGREF56': {'text': 'More than 2 PKs provides a replicative advantage in co-transfection', 'latex': None, 'type': 'figure'}, 'TABREF0': {'text': 'encephalomyocarditis virus (EMCV) and equine rhinitis A virus (ERAV) (21, 22). However, in both cases the PKs are located at the 5′ side of the poly-C-tract, making their location in the FMDV genome unique. PKs have been reported to have roles in several aspects of viral replication including splicing 97 (e.g. HIV and influenza), ribosomal frameshifting (e.g. coronaviruses) and RNase protection 98 (e.g. Dengue virus) (23-27). In the work reported here, the role of the PKs in the FMDV life cycle was investigated, together with biochemical probing of PK structures. The combination of both virus and replicon systems allowed us to distinguish effects on genome replication and 101 other aspects of the viral life cycle. 
Selective mutation within the PK domain and sequential 102 deletion of PKs confirmed the importance of PK structure and that although genome replication', 'latex': None, 'type': 'table'}, 'TABREF1': {'text': \"Materials and MethodsCells lines.BHK-21 cells obtained from the ATCC (LGC Standard) were maintained in Dulbecco's modified Eagle's Medium with glutamine (Sigma-Aldrich) supplemented with 10 % foetal calf serum (FCS), 50 U/ml penicillin and 50 µg/ml streptomycin.\", 'latex': None, 'type': 'table'}, 'TABREF2': {'text': 'Wt, C11, ∆PK 34 and ∆PK 234 constructs all resulted in the production of infectious virus as was expected from the replicon experiments, with no alteration to input sequence. However, the C11 ∆PK 1234, which replicated (albeit to a lesser degree) as a replicon, produced no recoverable infectious virus(Table 1). Interestingly, there were differences noted in both the', 'latex': None, 'type': 'table'}}, 'back_matter': [{'text': 'author/funder. All rights reserved. No reuse allowed without permission.The copyright holder for this preprint (which was not peer-reviewed) is the The copyright holder for this preprint (which was not peer-reviewed) is the . 
https://doi.org/10.1101/2020.01.10.901801 doi: bioRxiv preprint', 'cite_spans': [], 'ref_spans': [], 'section': 'annex'}]}\n" + "\n", + "RangeIndex: 1934 entries, 0 to 1933\n", + "Data columns (total 7 columns):\n", + "paper_id 1934 non-null object\n", + "abstract 1933 non-null object\n", + "body_text 1934 non-null object\n", + "authors 1934 non-null object\n", + "title 1934 non-null object\n", + "abstract_word_count 1934 non-null int64\n", + "body_word_count 1934 non-null int64\n", + "dtypes: int64(2), object(5)\n", + "memory usage: 105.8+ KB\n" ] } ], "source": [ - "articles = {}\n", - "for i in range(1):\n", - " with client_hdfs.read(fpaths_json[i]) as reader:\n", - " from json import load\n", - " print(load(reader))" + "df_covid.info()" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 131, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'Middle East Respiratory Syndrome coronavirus (MERS-CoV) was first isolated from a patient with severe pneumonia in 2012. The 2015 Korea outbreak of MERS-CoV involved 186 cases, including 38 fatalities. A total of 83% of transmission events were due to five superspreaders, and 44% of the 186 MERS cases were the patients who had been exposed in nosocomial transmission at 16 hospitals. The epidemic lasted for 2 months and the government quarantined 16,993 individuals for 14 days to control the outbreak. This outbreak provides a unique opportunity to fill the gap in our knowledge of MERS-CoV infection. 
Therefore, in this paper, we review the literature on epidemiology, virology, clinical features, and prevention of MERS-CoV, which were acquired from the 2015 Korea outbreak of MERS-CoV.'" + "count 1933\n", + "unique 1932\n", + "top Fast testing can help mitigate the coronavirus...\n", + "freq 2\n", + "Name: abstract, dtype: object" ] }, - "execution_count": 33, + "execution_count": 131, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "articles[0].abstract" + "df_covid['abstract'].describe(include='all')" ] }, { "cell_type": "code", - "execution_count": 34, - "metadata": { - "scrolled": true - }, + "execution_count": 132, + "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Article Object with id: d0c4236f95ba2b3d5cdc71fde524b26262f05dd3 \n", - " title:Send Orders for Reprints to reprints@benthamscience.net A Brief Review: The Z-curve Theory and its Application in Genome Analysis \n", - " abstract:In theoretical physics, there exist two basic mathematical approaches, algebraic and geometrical methods, which, in most cases, are complementary. In the area of genome sequence analysis, however, algebraic approaches have been widely used, while geometrical approaches have been less explored for a long time. The Z-curve theory is a geometrical approach to genome analysis. The Z-curve is a three-dimensional curve that represents a given DNA sequence in the sense that each can be uniquely reconstructed given the other. The Z-curve, therefore, contains all the information that the corresponding DNA sequence carries. The analysis of a DNA sequence can then be performed through studying the corresponding Z-curve. 
The Z-curve method has found applications in a wide range of areas in the past two decades, including the identifications of protein-coding genes, replication origins, horizontally-transferred genomic islands, promoters, translational start sides and isochores, as well as studies on phylogenetics, genome visualization and comparative genomics. Here, we review the progress of Z-curve studies from aspects of both theory and applications in genome analysis." + "count 1934\n", + "unique 1934\n", + "top Since SARS-Cov-2 epidemic appeared in Wuhan Ch...\n", + "freq 1\n", + "Name: body_text, dtype: object" ] }, - "execution_count": 34, + "execution_count": 132, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "articles[1]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Data Cleaning" + "df_covid['body_text'].describe(include='all')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now that we have explored the structure of the dataset, let us focus on our areas of interest as mentioned earlier: namely the Title, Abstract and the body of the text.\\\n", - "We know that we can access these for a given paper by scanning the metadata for a paper of choice for its title and abstract, and if we need it, using its `sha` column value to get the full text of the paper." + "The difference between the unique count and the total count means that there are some duplicate abstracts.\n", + "It is also possible that there are blank abstracts that are being counted as the same. 
" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 133, "metadata": {}, + "outputs": [], "source": [ - "First let us clean the metadata by dropping all NA valued rows for the title, sha and abstract columns.\n", - "\n", - "[Pandas filter rows](https://stackoverflow.com/questions/17071871/how-to-select-rows-from-a-dataframe-based-on-column-values)" + "df_covid2 = df_covid.copy()" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 134, "metadata": {}, "outputs": [], "source": [ - "bio= meta.loc[meta['source_x'] == 'biorxiv']" + "df_covid2.drop_duplicates(['body_text'],inplace=True)" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 135, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Meta count:51078 biorxiv papers count: 764\n" + "\n", + "Int64Index: 1934 entries, 0 to 1933\n", + "Data columns (total 7 columns):\n", + "paper_id 1934 non-null object\n", + "abstract 1933 non-null object\n", + "body_text 1934 non-null object\n", + "authors 1934 non-null object\n", + "title 1934 non-null object\n", + "abstract_word_count 1934 non-null int64\n", + "body_word_count 1934 non-null int64\n", + "dtypes: int64(2), object(5)\n", + "memory usage: 120.9+ KB\n" ] } ], "source": [ - "print(f'Meta count:{len(meta)} biorxiv papers count: {len(bio)}')" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [ - "bio_clean = bio.drop_duplicates().dropna()" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(bio_clean)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Using the above dropna condition doesn't work out, number of rows in the resulting dataframe is zero.\n", - "We found a solution Cite: 
[stack overflow post](https://stackoverflow.com/questions/39241346/pandas-dropna-on-specify-attribute)" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [], - "source": [ - "bio_clean = bio.loc[~(bio.sha.isnull())]" + "df_covid2.info()" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 136, "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abstract_word_countbody_word_count
count1934.0000001934.000000
mean222.0687692986.755429
std92.7192302207.443432
min1.00000043.000000
25%152.0000001616.750000
50%216.0000002539.000000
75%270.0000003748.500000
max678.00000044519.000000
\n", + "
" + ], "text/plain": [ - "678" + " abstract_word_count body_word_count\n", + "count 1934.000000 1934.000000\n", + "mean 222.068769 2986.755429\n", + "std 92.719230 2207.443432\n", + "min 1.000000 43.000000\n", + "25% 152.000000 1616.750000\n", + "50% 216.000000 2539.000000\n", + "75% 270.000000 3748.500000\n", + "max 678.000000 44519.000000" ] }, - "execution_count": 40, + "execution_count": 136, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "len(bio_clean)" + "df_covid2.describe()" ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 137, "metadata": {}, "outputs": [ { @@ -3426,401 +2055,499 @@ " \n", " \n", " \n", - " cord_uid\n", - " sha\n", - " source_x\n", - " title\n", - " doi\n", - " pmcid\n", - " pubmed_id\n", - " license\n", - " abstract\n", - " publish_time\n", - " authors\n", - " journal\n", - " Microsoft Academic Paper ID\n", - " WHO #Covidence\n", - " has_pdf_parse\n", - " has_pmc_xml_parse\n", - " full_text_file\n", - " url\n", + " abstract_word_count\n", + " body_word_count\n", " \n", " \n", " \n", " \n", - " 32366\n", - " vho70jcx\n", - " f056da9c64fbf00a4645ae326e8a4339d015d155\n", - " biorxiv\n", - " SIANN: Strain Identification by Alignment to N...\n", - " 10.1101/001727\n", - " NaN\n", - " NaN\n", - " biorxiv\n", - " Next-generation sequencing is increasingly bei...\n", - " 2014-01-10\n", - " Samuel Minot; Stephen D Turner; Krista L Ternu...\n", - " NaN\n", - " NaN\n", - " NaN\n", - " True\n", - " False\n", - " biorxiv_medrxiv\n", - " https://doi.org/10.1101/001727\n", + " count\n", + " 1934.000000\n", + " 1934.000000\n", " \n", " \n", - " 32367\n", - " i9tbix2v\n", - " daf32e013d325a6feb80e83d15aabc64a48fae33\n", - " biorxiv\n", - " Spatial epidemiology of networked metapopulati...\n", - " 10.1101/003889\n", - " NaN\n", - " NaN\n", - " biorxiv\n", - " An emerging disease is one infectious epidemic...\n", - " 2014-06-04\n", - " Lin WANG; Xiang Li\n", - " NaN\n", - " NaN\n", - " NaN\n", - " True\n", - " 
False\n", - " biorxiv_medrxiv\n", - " https://doi.org/10.1101/003889\n", + " mean\n", + " 222.068769\n", + " 2986.755429\n", " \n", " \n", - " 32368\n", - " 62gfisc6\n", - " f33c6d94b0efaa198f8f3f20e644625fa3fe10d2\n", - " biorxiv\n", - " Sequencing of the human IG light chain loci fr...\n", - " 10.1101/006866\n", - " NaN\n", - " NaN\n", - " biorxiv\n", - " Germline variation at immunoglobulin gene (IG)...\n", - " 2014-07-03\n", - " Corey T Watson; Karyn Meltz Steinberg; Tina A ...\n", - " NaN\n", - " NaN\n", - " NaN\n", - " True\n", - " False\n", - " biorxiv_medrxiv\n", - " https://doi.org/10.1101/006866\n", + " std\n", + " 92.719230\n", + " 2207.443432\n", " \n", " \n", - " 32369\n", - " 058r9486\n", - " 4da8a87e614373d56070ed272487451266dce919\n", - " biorxiv\n", - " Bayesian mixture analysis for metagenomic comm...\n", - " 10.1101/007476\n", - " NaN\n", - " NaN\n", - " biorxiv\n", - " Deep sequencing of clinical samples is now an ...\n", - " 2014-07-25\n", - " Sofia Morfopoulou; Vincent Plagnol\n", - " NaN\n", - " NaN\n", - " NaN\n", - " True\n", - " False\n", - " biorxiv_medrxiv\n", - " https://doi.org/10.1101/007476\n", + " min\n", + " 1.000000\n", + " 43.000000\n", + " \n", + " \n", + " 25%\n", + " 152.000000\n", + " 1616.750000\n", + " \n", + " \n", + " 50%\n", + " 216.000000\n", + " 2539.000000\n", + " \n", + " \n", + " 75%\n", + " 270.000000\n", + " 3748.500000\n", " \n", " \n", - " 32370\n", - " wich35l7\n", - " eccef80cfbe078235df22398f195d5db462d8000\n", - " biorxiv\n", - " Mapping a viral phylogeny onto outbreak trees ...\n", - " 10.1101/010389\n", - " NaN\n", - " NaN\n", - " biorxiv\n", - " Developing methods to reconstruct transmission...\n", - " 2014-11-11\n", - " Stephen P Velsko; Jonathan E Allen\n", - " NaN\n", - " NaN\n", - " NaN\n", - " True\n", - " False\n", - " biorxiv_medrxiv\n", - " https://doi.org/10.1101/010389\n", + " max\n", + " 678.000000\n", + " 44519.000000\n", " \n", " \n", "\n", "" ], "text/plain": [ - " cord_uid sha source_x 
\\\n", - "32366 vho70jcx f056da9c64fbf00a4645ae326e8a4339d015d155 biorxiv \n", - "32367 i9tbix2v daf32e013d325a6feb80e83d15aabc64a48fae33 biorxiv \n", - "32368 62gfisc6 f33c6d94b0efaa198f8f3f20e644625fa3fe10d2 biorxiv \n", - "32369 058r9486 4da8a87e614373d56070ed272487451266dce919 biorxiv \n", - "32370 wich35l7 eccef80cfbe078235df22398f195d5db462d8000 biorxiv \n", - "\n", - " title doi \\\n", - "32366 SIANN: Strain Identification by Alignment to N... 10.1101/001727 \n", - "32367 Spatial epidemiology of networked metapopulati... 10.1101/003889 \n", - "32368 Sequencing of the human IG light chain loci fr... 10.1101/006866 \n", - "32369 Bayesian mixture analysis for metagenomic comm... 10.1101/007476 \n", - "32370 Mapping a viral phylogeny onto outbreak trees ... 10.1101/010389 \n", - "\n", - " pmcid pubmed_id license \\\n", - "32366 NaN NaN biorxiv \n", - "32367 NaN NaN biorxiv \n", - "32368 NaN NaN biorxiv \n", - "32369 NaN NaN biorxiv \n", - "32370 NaN NaN biorxiv \n", - "\n", - " abstract publish_time \\\n", - "32366 Next-generation sequencing is increasingly bei... 2014-01-10 \n", - "32367 An emerging disease is one infectious epidemic... 2014-06-04 \n", - "32368 Germline variation at immunoglobulin gene (IG)... 2014-07-03 \n", - "32369 Deep sequencing of clinical samples is now an ... 2014-07-25 \n", - "32370 Developing methods to reconstruct transmission... 2014-11-11 \n", - "\n", - " authors journal \\\n", - "32366 Samuel Minot; Stephen D Turner; Krista L Ternu... NaN \n", - "32367 Lin WANG; Xiang Li NaN \n", - "32368 Corey T Watson; Karyn Meltz Steinberg; Tina A ... 
NaN \n", - "32369 Sofia Morfopoulou; Vincent Plagnol NaN \n", - "32370 Stephen P Velsko; Jonathan E Allen NaN \n", - "\n", - " Microsoft Academic Paper ID WHO #Covidence has_pdf_parse \\\n", - "32366 NaN NaN True \n", - "32367 NaN NaN True \n", - "32368 NaN NaN True \n", - "32369 NaN NaN True \n", - "32370 NaN NaN True \n", - "\n", - " has_pmc_xml_parse full_text_file url \n", - "32366 False biorxiv_medrxiv https://doi.org/10.1101/001727 \n", - "32367 False biorxiv_medrxiv https://doi.org/10.1101/003889 \n", - "32368 False biorxiv_medrxiv https://doi.org/10.1101/006866 \n", - "32369 False biorxiv_medrxiv https://doi.org/10.1101/007476 \n", - "32370 False biorxiv_medrxiv https://doi.org/10.1101/010389 " + " abstract_word_count body_word_count\n", + "count 1934.000000 1934.000000\n", + "mean 222.068769 2986.755429\n", + "std 92.719230 2207.443432\n", + "min 1.000000 43.000000\n", + "25% 152.000000 1616.750000\n", + "50% 216.000000 2539.000000\n", + "75% 270.000000 3748.500000\n", + "max 678.000000 44519.000000" ] }, - "execution_count": 41, + "execution_count": 137, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "bio_clean.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['biorxiv'] [nan] [nan] [nan] [nan]\n" - ] - } - ], - "source": [ - "print(bio_clean['source_x'].unique(),\n", - "bio_clean['pmcid'].unique(),\n", - "bio_clean['pubmed_id'].unique(),\n", - "bio_clean['Microsoft Academic Paper ID'].unique(),\n", - "bio_clean['WHO #Covidence'].unique())" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [], - "source": [ - "bio_slim = bio_clean.drop(['Microsoft Academic Paper ID','WHO #Covidence','pubmed_id','pmcid'], axis = 1)" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [], - "source": [ - "bio_slim = 
bio_slim.drop(['url','doi','license','full_text_file'],axis =1 )" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [], - "source": [ - "bio_slim = bio_slim[~bio_slim.has_pdf_parse.isin(['False'])]" + "df_covid.describe()" ] }, { - "cell_type": "code", - "execution_count": 46, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "bio_slim =bio_slim.loc[~(bio_slim.abstract.isnull())]" + "We now clean the text data so that our key words are clear and we do not have to worry about details we are not concerned about. [ref1](https://towardsdatascience.com/nlp-text-preprocessing-a-practical-guide-and-template-d80874676e79) [ref2](https://www.geeksforgeeks.org/text-preprocessing-in-python-set-1/)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "As we had seen at the send of section 2.3, all_json contains the paths of all the json files(i.e. the articles) in it.\n", - "Now we should load this into a dataframe for it to be queryable" + "Looking at the abstract column, we can see that the text scraped from these articles is not perfect.\n", + "\n", + "For example, many of the abstracts start with the word *Abstract* glued to the beginning of the first word of the real abstract body. " ] }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 138, "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
paper_idabstractbody_textauthorstitleabstract_word_countbody_word_count
00015023cc06b5362d332b3baf348d11567ca2fbbThe positive stranded RNA genomes of picornavi...VP3, and VP0 (which is further processed to VP...Joseph C. Ward. Lidia Lasecka-Dykes...(The RNA pseudoknots in foot-and-mouth disease...1941709
100340eea543336d54adda18236424de6a5e91c9dDuring the past three months, a new coronaviru...In December 2019, a novel coronavirus, SARS-Co...Carla Mavian. Simone Marini...(Regaining perspective on SARS-CoV-2 molecular...1382487
2004f0f8bb66cf446678dc13cf2701feec4f36d76We integrate the human movement and healthcare...The 2019-nCoV epidemic has spread across China...Hanchu Zhou. Jianan Yang...(Healthcare-resource-adjusted vulnerabilities ...34749
3005d189d5bd7ac01aee65e934fd3d5186a3f7b27The rapid outbreak of the new Coronavirus (COV...The outbreak of infectious diseases has always...Behzad Pirouz. Amirsina Golmohammadi...(Relationship between Average Daily Temperatur...2492791
400911cf4f99a3d5ae5e5b787675646a743574496ABSTRARCTThe fast accumulation of viral metage...Metagenomic sequencing, which allows us to dir...Jiayu Shang. Yanni Sun(CHEER: hierarCHical taxonomic classification ...1395153
5009ceadbb6f9f0d597b9516f0b9ca01e40231aadObjective: Describe the changes in patient pop...Since its emergence in December 2019 the novel...Omar Badawi. Xinggang Liu...(Impact of COVID-19 pandemic on severity of il...2921409
600d16927588fb04d4be0e6b269fc02f0d3c2aa7bInfectious bronchitis (IB) causes significant ...Infectious bronchitis (IB), which is caused by...Salman L. Butt. Eric C. Erwood...(Real-time, MinION-based, amplicon sequencing ...2483958
700eb9220dc8cd351393b6b035323d350f103f8c6Importance: As with other traumatic events, pa...Evidence from prior pandemics suggests that, a...Victor M. Castro. Roy H Perlis(Impact of COVID-19 on psychiatric assessment ...2462103
80109d1273b2d59a099ab66cdad6939d5e7fcb2e8In severe cases of coronavirus disease 2019 (C...To date, the coronavirus disease 2019 (COVID-1...Yu Zuo. Srilakshmi Yalavarthi...(Neutrophil extracellular traps (NETs) as mark...2472734
90139ea4ca580af99b602c6435368e7fdbefacb03BackgroundNipah Virus (NiV) came into limeligh...Nipah is an infectious negative-sense single-s...Nishi Kumari. Ayush Upadhyay...(A Combined Evidence Approach to Prioritize Ni...3262382
\n", + "
" + ], "text/plain": [ - "Article Object with id: e6d882be4961d1bdd7507b4a29d86b650de0895d \n", - " title:SEARCH STRATEGY PubMed databases were searched for studies on MERS cases during the 2015 outbreak in the Republic of Korea. The search terms used were combinations of 'Middle East respiratory syndrome' in all fields and 'Korea' in all \n", - " abstract:Middle East Respiratory Syndrome coronavirus (MERS-CoV) was first isolated from a patient with severe pneumonia in 2012. The 2015 Korea outbreak of MERS-CoV involved 186 cases, including 38 fatalities. A total of 83% of transmission events were due to five superspreaders, and 44% of the 186 MERS cases were the patients who had been exposed in nosocomial transmission at 16 hospitals. The epidemic lasted for 2 months and the government quarantined 16,993 individuals for 14 days to control the outbreak. This outbreak provides a unique opportunity to fill the gap in our knowledge of MERS-CoV infection. Therefore, in this paper, we review the literature on epidemiology, virology, clinical features, and prevention of MERS-CoV, which were acquired from the 2015 Korea outbreak of MERS-CoV." + " paper_id \\\n", + "0 0015023cc06b5362d332b3baf348d11567ca2fbb \n", + "1 00340eea543336d54adda18236424de6a5e91c9d \n", + "2 004f0f8bb66cf446678dc13cf2701feec4f36d76 \n", + "3 005d189d5bd7ac01aee65e934fd3d5186a3f7b27 \n", + "4 00911cf4f99a3d5ae5e5b787675646a743574496 \n", + "5 009ceadbb6f9f0d597b9516f0b9ca01e40231aad \n", + "6 00d16927588fb04d4be0e6b269fc02f0d3c2aa7b \n", + "7 00eb9220dc8cd351393b6b035323d350f103f8c6 \n", + "8 0109d1273b2d59a099ab66cdad6939d5e7fcb2e8 \n", + "9 0139ea4ca580af99b602c6435368e7fdbefacb03 \n", + "\n", + " abstract \\\n", + "0 The positive stranded RNA genomes of picornavi... \n", + "1 During the past three months, a new coronaviru... \n", + "2 We integrate the human movement and healthcare... \n", + "3 The rapid outbreak of the new Coronavirus (COV... 
\n", + "4 ABSTRARCTThe fast accumulation of viral metage... \n", + "5 Objective: Describe the changes in patient pop... \n", + "6 Infectious bronchitis (IB) causes significant ... \n", + "7 Importance: As with other traumatic events, pa... \n", + "8 In severe cases of coronavirus disease 2019 (C... \n", + "9 BackgroundNipah Virus (NiV) came into limeligh... \n", + "\n", + " body_text \\\n", + "0 VP3, and VP0 (which is further processed to VP... \n", + "1 In December 2019, a novel coronavirus, SARS-Co... \n", + "2 The 2019-nCoV epidemic has spread across China... \n", + "3 The outbreak of infectious diseases has always... \n", + "4 Metagenomic sequencing, which allows us to dir... \n", + "5 Since its emergence in December 2019 the novel... \n", + "6 Infectious bronchitis (IB), which is caused by... \n", + "7 Evidence from prior pandemics suggests that, a... \n", + "8 To date, the coronavirus disease 2019 (COVID-1... \n", + "9 Nipah is an infectious negative-sense single-s... \n", + "\n", + " authors \\\n", + "0 Joseph C. Ward. Lidia Lasecka-Dykes... \n", + "1 Carla Mavian. Simone Marini... \n", + "2 Hanchu Zhou. Jianan Yang... \n", + "3 Behzad Pirouz. Amirsina Golmohammadi... \n", + "4 Jiayu Shang. Yanni Sun \n", + "5 Omar Badawi. Xinggang Liu... \n", + "6 Salman L. Butt. Eric C. Erwood... \n", + "7 Victor M. Castro. Roy H Perlis \n", + "8 Yu Zuo. Srilakshmi Yalavarthi... \n", + "9 Nishi Kumari. Ayush Upadhyay... \n", + "\n", + " title abstract_word_count \\\n", + "0 (The RNA pseudoknots in foot-and-mouth disease... 194 \n", + "1 (Regaining perspective on SARS-CoV-2 molecular... 138 \n", + "2 (Healthcare-resource-adjusted vulnerabilities ... 34 \n", + "3 (Relationship between Average Daily Temperatur... 249 \n", + "4 (CHEER: hierarCHical taxonomic classification ... 139 \n", + "5 (Impact of COVID-19 pandemic on severity of il... 292 \n", + "6 (Real-time, MinION-based, amplicon sequencing ... 248 \n", + "7 (Impact of COVID-19 on psychiatric assessment ... 
246 \n", + "8 (Neutrophil extracellular traps (NETs) as mark... 247 \n", + "9 (A Combined Evidence Approach to Prioritize Ni... 326 \n", + "\n", + " body_word_count \n", + "0 1709 \n", + "1 2487 \n", + "2 749 \n", + "3 2791 \n", + "4 5153 \n", + "5 1409 \n", + "6 3958 \n", + "7 2103 \n", + "8 2734 \n", + "9 2382 " ] }, - "execution_count": 47, + "execution_count": 138, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "_125json[0]\n", - "Article(_125json[0])" + "df_covid2['abstract'] = df_covid2['abstract'].apply(lambda x: str(x).replace('Abstract',''))\n", + "df_covid2.head(10)" ] }, { - "cell_type": "code", - "execution_count": 48, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing index: 0 of 12500\n", - "Processing index: 1250 of 12500\n", - "Processing index: 2500 of 12500\n", - "Processing index: 3750 of 12500\n", - "Processing index: 5000 of 12500\n", - "Processing index: 6250 of 12500\n", - "Processing index: 7500 of 12500\n", - "Processing index: 8750 of 12500\n", - "Processing index: 10000 of 12500\n", - "Processing index: 11250 of 12500\n", - " time taken : 0:04:34.872012\n" - ] - } - ], "source": [ - "start_time = datetime.now()\n", - "dict_ = {'paper_id': [], 'abstract': [], 'body_text': [], 'authors': [], 'title': []}\n", - "for idx, entry in enumerate(_125json):\n", - " if idx % (len(_125json) // 10) == 0:\n", - " print(f'Processing index: {idx} of {len(_125json)}')\n", - " content = Article(entry)\n", - " \n", - " # get metadata information\n", - " meta_data = meta.loc[meta['sha'] == content.paper_id]\n", - " # no metadata, skip this paper\n", - " if len(meta_data) == 0:\n", - " continue\n", - " \n", - "\n", - " dict_['body_text'].append(content.body_text)\n", - " \n", - " \n", - " # get metadata information\n", - " meta_data = meta.loc[meta['sha'] == content.paper_id]\n", - " \n", - " try:\n", - " # if more than one author\n", - " authors = 
meta_data['authors'].values[0].split(';')\n", - " if len(authors) > 2:\n", - " # more than 2 authors, may be problem when plotting, so take first 2 append with ...\n", - " dict_['authors'].append(\". \".join(authors[:2]) + \"...\")\n", - " else:\n", - " # authors will fit in plot\n", - " dict_['authors'].append(\". \".join(authors))\n", - " except Exception as e:\n", - " # if only one author - or Null valie\n", - " dict_['authors'].append(meta_data['authors'].values[0])\n", - " \n", - " # add the title information, add breaks when needed\n", - " try:\n", - " title = meta_data['title'].values[0], 40\n", - " dict_['title'].append(title)\n", - " # if title was not provided\n", - " except Exception as e:\n", - " dict_['title'].append(meta_data['title'].values[0])\n", - " \n", - " \n", - " dict_['paper_id'].append(meta_data['sha'].values[0])\n", - " dict_['abstract'].append(meta_data['abstract'].values[0])\n", - " \n", - "df_covid = pd.DataFrame(dict_, columns=['paper_id', 'abstract', 'body_text', 'authors', 'title'])\n", - "df_covid.head()\n", - "end_time = datetime.now()\n", - "print(f' time taken : {(end_time - start_time)}')" + "Now let us remove all punctuation from text and change it to lowercase" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data Preprocessing" ] }, { "cell_type": "code", - "execution_count": 49, - "metadata": { - "scrolled": true - }, + "execution_count": 139, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "8811 8811 8811 8811 8811\n" + "imported text preprocessing libraries, time taken:0:00:00\n" ] } ], "source": [ - "print(len(dict_['paper_id']),len(dict_['abstract']),len(dict_['body_text']),len(dict_['authors']),len(dict_['title']))" + "start_time = datetime.now()\n", + "import nltk \n", + "import string \n", + "import re \n", + "end_time = datetime.now()\n", + "print(f'imported text preprocessing libraries, time taken:{end_time-start_time}')" ] }, { "cell_type": "code", 
- "execution_count": 50, + "execution_count": 140, "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
paper_idabstractbody_textauthorstitleabstract_word_countbody_word_count
00015023cc06b5362d332b3baf348d11567ca2fbbthe positive stranded rna genomes of picornavi...VP3, and VP0 (which is further processed to VP...Joseph C. Ward. Lidia Lasecka-Dykes...(The RNA pseudoknots in foot-and-mouth disease...1941709
100340eea543336d54adda18236424de6a5e91c9dduring the past three months, a new coronaviru...In December 2019, a novel coronavirus, SARS-Co...Carla Mavian. Simone Marini...(Regaining perspective on SARS-CoV-2 molecular...1382487
2004f0f8bb66cf446678dc13cf2701feec4f36d76we integrate the human movement and healthcare...The 2019-nCoV epidemic has spread across China...Hanchu Zhou. Jianan Yang...(Healthcare-resource-adjusted vulnerabilities ...34749
3005d189d5bd7ac01aee65e934fd3d5186a3f7b27the rapid outbreak of the new coronavirus (cov...The outbreak of infectious diseases has always...Behzad Pirouz. Amirsina Golmohammadi...(Relationship between Average Daily Temperatur...2492791
400911cf4f99a3d5ae5e5b787675646a743574496abstrarctthe fast accumulation of viral metage...Metagenomic sequencing, which allows us to dir...Jiayu Shang. Yanni Sun(CHEER: hierarCHical taxonomic classification ...1395153
\n", + "
" + ], "text/plain": [ - "123" + " paper_id \\\n", + "0 0015023cc06b5362d332b3baf348d11567ca2fbb \n", + "1 00340eea543336d54adda18236424de6a5e91c9d \n", + "2 004f0f8bb66cf446678dc13cf2701feec4f36d76 \n", + "3 005d189d5bd7ac01aee65e934fd3d5186a3f7b27 \n", + "4 00911cf4f99a3d5ae5e5b787675646a743574496 \n", + "\n", + " abstract \\\n", + "0 the positive stranded rna genomes of picornavi... \n", + "1 during the past three months, a new coronaviru... \n", + "2 we integrate the human movement and healthcare... \n", + "3 the rapid outbreak of the new coronavirus (cov... \n", + "4 abstrarctthe fast accumulation of viral metage... \n", + "\n", + " body_text \\\n", + "0 VP3, and VP0 (which is further processed to VP... \n", + "1 In December 2019, a novel coronavirus, SARS-Co... \n", + "2 The 2019-nCoV epidemic has spread across China... \n", + "3 The outbreak of infectious diseases has always... \n", + "4 Metagenomic sequencing, which allows us to dir... \n", + "\n", + " authors \\\n", + "0 Joseph C. Ward. Lidia Lasecka-Dykes... \n", + "1 Carla Mavian. Simone Marini... \n", + "2 Hanchu Zhou. Jianan Yang... \n", + "3 Behzad Pirouz. Amirsina Golmohammadi... \n", + "4 Jiayu Shang. Yanni Sun \n", + "\n", + " title abstract_word_count \\\n", + "0 (The RNA pseudoknots in foot-and-mouth disease... 194 \n", + "1 (Regaining perspective on SARS-CoV-2 molecular... 138 \n", + "2 (Healthcare-resource-adjusted vulnerabilities ... 34 \n", + "3 (Relationship between Average Daily Temperatur... 249 \n", + "4 (CHEER: hierarCHical taxonomic classification ... 
139 \n", + "\n", + " body_word_count \n", + "0 1709 \n", + "1 2487 \n", + "2 749 \n", + "3 2791 \n", + "4 5153 " ] }, - "execution_count": 50, + "execution_count": 140, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "len(df_covid['abstract'][0].strip().split())" + "df_covid2['abstract'] = df_covid2['abstract'].apply(lambda x: str(x).lower())\n", + "df_covid2.head()" ] }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 141, "metadata": {}, "outputs": [ { @@ -3856,103 +2583,53 @@ " \n", " \n", " 0\n", - " e6d882be4961d1bdd7507b4a29d86b650de0895d\n", - " Middle East Respiratory Syndrome coronavirus (...\n", - " Middle East respiratory syndrome coronavirus (...\n", - " Oh, Myoung-don. Park, Wan Beom...\n", - " (Middle East respiratory syndrome: what we lea...\n", - " 123\n", - " 3907\n", + " 0015023cc06b5362d332b3baf348d11567ca2fbb\n", + " the positive stranded rna genomes of picornavi...\n", + " vp3, and vp0 (which is further processed to vp...\n", + " Joseph C. Ward. Lidia Lasecka-Dykes...\n", + " (The RNA pseudoknots in foot-and-mouth disease...\n", + " 194\n", + " 1709\n", " \n", " \n", " 1\n", - " d0c4236f95ba2b3d5cdc71fde524b26262f05dd3\n", - " In theoretical physics, there exist two basic ...\n", - " In theoretical physics, there exist two basic ...\n", - " Zhang, Ren. Zhang, Chun-Ting\n", - " (A Brief Review: The Z-curve Theory and its Ap...\n", - " 169\n", - " 6359\n", + " 00340eea543336d54adda18236424de6a5e91c9d\n", + " during the past three months, a new coronaviru...\n", + " in december 2019, a novel coronavirus, sars-co...\n", + " Carla Mavian. Simone Marini...\n", + " (Regaining perspective on SARS-CoV-2 molecular...\n", + " 138\n", + " 2487\n", " \n", " \n", " 2\n", - " d24afd9ee025b53015824be203db539009964fbd\n", - " BACKGROUND: The aim of this research was to ev...\n", - " In view of the role of thin-section CT scannin...\n", - " Xing, Zhi-Heng. 
Sun, Xin...\n", - " (Thin-section Computed Tomography Detects Long...\n", - " 326\n", - " 2937\n", + " 004f0f8bb66cf446678dc13cf2701feec4f36d76\n", + " we integrate the human movement and healthcare...\n", + " the 2019-ncov epidemic has spread across china...\n", + " Hanchu Zhou. Jianan Yang...\n", + " (Healthcare-resource-adjusted vulnerabilities ...\n", + " 34\n", + " 749\n", " \n", " \n", " 3\n", - " 4a3941003ea2673397975ae8bc2536ad59f789e5\n", - " INTRODUCTION: The Major Incident Hospital (MIH...\n", - " injury. Medical management does not simply inv...\n", - " Marres, G. M. H.. van der Eijk, J....\n", - " (Evaluation of admissions to the Major Inciden...\n", - " 264\n", - " 4830\n", + " 005d189d5bd7ac01aee65e934fd3d5186a3f7b27\n", + " the rapid outbreak of the new coronavirus (cov...\n", + " the outbreak of infectious diseases has always...\n", + " Behzad Pirouz. Amirsina Golmohammadi...\n", + " (Relationship between Average Daily Temperatur...\n", + " 249\n", + " 2791\n", " \n", " \n", " 4\n", - " 905e2b35719215760ebed8f3e93fdf6ea8ec9bcf\n", - " Mice infected with the neurotropic JHM strain ...\n", - " 1. Frequently change and/or sterilize gloves t...\n", - " Carbajal, Kevin S.. Weinger, Jason G....\n", - " (Surgical Transplantation of Mouse Neural Stem...\n", - " 137\n", - " 1300\n", - " \n", - " \n", - " 5\n", - " 319004f23d1af4357edb2a3862f2619be23a21a6\n", - " Diagnostics play a central role in the early d...\n", - " ► The Middle East respiratory syndrome-coronav...\n", - " Kelly-Cirino, Cassandra. Mazzola, Laura T...\n", - " (An updated roadmap for MERS-CoV research and ...\n", - " 212\n", - " 3632\n", - " \n", - " \n", - " 6\n", - " 4971940ab68e4950b28410a208166900e8ba9c07\n", - " Despite of the role of domestic dogs as reserv...\n", - " Since its initial identification in the late 1...\n", - " Vieira, Flávia V.. 
Hoffmann, Daniel J....\n", - " (Circulation of canine parvovirus among dogs l...\n", - " 137\n", - " 1519\n", - " \n", - " \n", - " 7\n", - " cae1f2fd5785845caa42d6497361eba46a68f11d\n", - " Diversity and plasticity are two hallmarks of ...\n", - " Macrophages were first identified by Elie Metc...\n", - " Liu, Yan-Cun. Zou, Xian-Biao...\n", - " (Macrophage Polarization in Inflammatory Disea...\n", - " 111\n", - " 4608\n", - " \n", - " \n", - " 8\n", - " 2ffddf5caaef38207b58710a93ee8361518813c9\n", - " This commentary offers suggestions for improvi...\n", - " I n the early 20th century, the new field of m...\n", - " Kahn, Laura H.\n", - " (The need for one health degree programs, 40)\n", - " 56\n", - " 1177\n", - " \n", - " \n", - " 9\n", - " 4bae83e2441c3738d96e49c21c9be0a4c85b4a92\n", - " The protective efficacy of DNA plasmids encodi...\n", - " Avian infectious bronchitis (IB) is a major di...\n", - " Yan, Fang. Zhao, Yujun...\n", - " (Protection of chickens against infectious bro...\n", - " 179\n", - " 2987\n", + " 00911cf4f99a3d5ae5e5b787675646a743574496\n", + " abstrarctthe fast accumulation of viral metage...\n", + " metagenomic sequencing, which allows us to dir...\n", + " Jiayu Shang. 
Yanni Sun\n", + " (CHEER: hierarCHical taxonomic classification ...\n", + " 139\n", + " 5153\n", " \n", " \n", "\n", @@ -3960,254 +2637,394 @@ ], "text/plain": [ " paper_id \\\n", - "0 e6d882be4961d1bdd7507b4a29d86b650de0895d \n", - "1 d0c4236f95ba2b3d5cdc71fde524b26262f05dd3 \n", - "2 d24afd9ee025b53015824be203db539009964fbd \n", - "3 4a3941003ea2673397975ae8bc2536ad59f789e5 \n", - "4 905e2b35719215760ebed8f3e93fdf6ea8ec9bcf \n", - "5 319004f23d1af4357edb2a3862f2619be23a21a6 \n", - "6 4971940ab68e4950b28410a208166900e8ba9c07 \n", - "7 cae1f2fd5785845caa42d6497361eba46a68f11d \n", - "8 2ffddf5caaef38207b58710a93ee8361518813c9 \n", - "9 4bae83e2441c3738d96e49c21c9be0a4c85b4a92 \n", + "0 0015023cc06b5362d332b3baf348d11567ca2fbb \n", + "1 00340eea543336d54adda18236424de6a5e91c9d \n", + "2 004f0f8bb66cf446678dc13cf2701feec4f36d76 \n", + "3 005d189d5bd7ac01aee65e934fd3d5186a3f7b27 \n", + "4 00911cf4f99a3d5ae5e5b787675646a743574496 \n", "\n", " abstract \\\n", - "0 Middle East Respiratory Syndrome coronavirus (... \n", - "1 In theoretical physics, there exist two basic ... \n", - "2 BACKGROUND: The aim of this research was to ev... \n", - "3 INTRODUCTION: The Major Incident Hospital (MIH... \n", - "4 Mice infected with the neurotropic JHM strain ... \n", - "5 Diagnostics play a central role in the early d... \n", - "6 Despite of the role of domestic dogs as reserv... \n", - "7 Diversity and plasticity are two hallmarks of ... \n", - "8 This commentary offers suggestions for improvi... \n", - "9 The protective efficacy of DNA plasmids encodi... \n", + "0 the positive stranded rna genomes of picornavi... \n", + "1 during the past three months, a new coronaviru... \n", + "2 we integrate the human movement and healthcare... \n", + "3 the rapid outbreak of the new coronavirus (cov... \n", + "4 abstrarctthe fast accumulation of viral metage... \n", "\n", " body_text \\\n", - "0 Middle East respiratory syndrome coronavirus (... 
\n", - "1 In theoretical physics, there exist two basic ... \n", - "2 In view of the role of thin-section CT scannin... \n", - "3 injury. Medical management does not simply inv... \n", - "4 1. Frequently change and/or sterilize gloves t... \n", - "5 ► The Middle East respiratory syndrome-coronav... \n", - "6 Since its initial identification in the late 1... \n", - "7 Macrophages were first identified by Elie Metc... \n", - "8 I n the early 20th century, the new field of m... \n", - "9 Avian infectious bronchitis (IB) is a major di... \n", + "0 vp3, and vp0 (which is further processed to vp... \n", + "1 in december 2019, a novel coronavirus, sars-co... \n", + "2 the 2019-ncov epidemic has spread across china... \n", + "3 the outbreak of infectious diseases has always... \n", + "4 metagenomic sequencing, which allows us to dir... \n", "\n", - " authors \\\n", - "0 Oh, Myoung-don. Park, Wan Beom... \n", - "1 Zhang, Ren. Zhang, Chun-Ting \n", - "2 Xing, Zhi-Heng. Sun, Xin... \n", - "3 Marres, G. M. H.. van der Eijk, J.... \n", - "4 Carbajal, Kevin S.. Weinger, Jason G.... \n", - "5 Kelly-Cirino, Cassandra. Mazzola, Laura T... \n", - "6 Vieira, Flávia V.. Hoffmann, Daniel J.... \n", - "7 Liu, Yan-Cun. Zou, Xian-Biao... \n", - "8 Kahn, Laura H. \n", - "9 Yan, Fang. Zhao, Yujun... \n", + " authors \\\n", + "0 Joseph C. Ward. Lidia Lasecka-Dykes... \n", + "1 Carla Mavian. Simone Marini... \n", + "2 Hanchu Zhou. Jianan Yang... \n", + "3 Behzad Pirouz. Amirsina Golmohammadi... \n", + "4 Jiayu Shang. Yanni Sun \n", "\n", " title abstract_word_count \\\n", - "0 (Middle East respiratory syndrome: what we lea... 123 \n", - "1 (A Brief Review: The Z-curve Theory and its Ap... 169 \n", - "2 (Thin-section Computed Tomography Detects Long... 326 \n", - "3 (Evaluation of admissions to the Major Inciden... 264 \n", - "4 (Surgical Transplantation of Mouse Neural Stem... 137 \n", - "5 (An updated roadmap for MERS-CoV research and ... 
212 \n", - "6 (Circulation of canine parvovirus among dogs l... 137 \n", - "7 (Macrophage Polarization in Inflammatory Disea... 111 \n", - "8 (The need for one health degree programs, 40) 56 \n", - "9 (Protection of chickens against infectious bro... 179 \n", + "0 (The RNA pseudoknots in foot-and-mouth disease... 194 \n", + "1 (Regaining perspective on SARS-CoV-2 molecular... 138 \n", + "2 (Healthcare-resource-adjusted vulnerabilities ... 34 \n", + "3 (Relationship between Average Daily Temperatur... 249 \n", + "4 (CHEER: hierarCHical taxonomic classification ... 139 \n", "\n", " body_word_count \n", - "0 3907 \n", - "1 6359 \n", - "2 2937 \n", - "3 4830 \n", - "4 1300 \n", - "5 3632 \n", - "6 1519 \n", - "7 4608 \n", - "8 1177 \n", - "9 2987 " + "0 1709 \n", + "1 2487 \n", + "2 749 \n", + "3 2791 \n", + "4 5153 " ] }, - "execution_count": 51, + "execution_count": 141, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_covid['abstract_word_count'] = df_covid['abstract'].apply(lambda x: len(str(x).strip().split()))\n", - "df_covid['body_word_count'] = df_covid['body_text'].apply(lambda x: len(str(x).strip().split()))\n", - "df_covid.head(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Dataset statistics" - ] - }, - { - "cell_type": "code", - "execution_count": 74, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Total number of words across all abstracts = 1383457. Average abstract length = 157.0147542844172\n" - ] - } - ], - "source": [ - "totalnumberofwords=df_covid['abstract_word_count'].sum()\n", - "avg_abstract_len = totalnumberofwords/len(df_covid['abstract_word_count'])\n", - "print(f'Total number of words across all abstracts = {totalnumberofwords}. 
Average abstract length = {avg_abstract_len}')" + "df_covid2['body_text'] = df_covid2['body_text'].apply(lambda x: str(x).lower())\n", + "df_covid2.head()" ] }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 142, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\n", - "RangeIndex: 8811 entries, 0 to 8810\n", - "Data columns (total 7 columns):\n", - "paper_id 8811 non-null object\n", - "abstract 7502 non-null object\n", - "body_text 8811 non-null object\n", - "authors 8543 non-null object\n", - "title 8811 non-null object\n", - "abstract_word_count 8811 non-null int64\n", - "body_word_count 8811 non-null int64\n", - "dtypes: int64(2), object(5)\n", - "memory usage: 481.9+ KB\n" + "Punctuated:\n", + " during the past three months, a new coronavirus (sars-cov-2) epidemic has been growing exponentially, affecting over 100 thousand people worldwide, and causing enormous distress to economies and societies of affected countries. a plethora of analyses based on viral sequences has already been published, in scientific journals as well as through non-peer reviewed channels, to investigate sars-cov-2 genetic heterogeneity and spatiotemporal dissemination. we examined all full genome sequences currently available to assess the presence of sufficient information for reliable phylogenetic and phylogeographic studies. our analysis clearly shows severe limitations in the present data, in light of which any finding should be considered, at the very best, preliminary and hypothesis-generating. hence the need for avoiding stigmatization based on partial information, and for continuing concerted efforts to increase number and quality of the sequences required for robust tracing of the epidemic. 
\n", + "\n", + "\n", + "\n", + "Cleaned:\n", + " during the past three months a new coronavirus sarscov2 epidemic has been growing exponentially affecting over 100 thousand people worldwide and causing enormous distress to economies and societies of affected countries a plethora of analyses based on viral sequences has already been published in scientific journals as well as through nonpeer reviewed channels to investigate sarscov2 genetic heterogeneity and spatiotemporal dissemination we examined all full genome sequences currently available to assess the presence of sufficient information for reliable phylogenetic and phylogeographic studies our analysis clearly shows severe limitations in the present data in light of which any finding should be considered at the very best preliminary and hypothesisgenerating hence the need for avoiding stigmatization based on partial information and for continuing concerted efforts to increase number and quality of the sequences required for robust tracing of the epidemic\n" ] } ], "source": [ - "df_covid.info()" + "print(\"Punctuated:\\n\",df_covid2['abstract'][1],'\\n\\n\\n')\n", + "translator = str.maketrans('','',string.punctuation)\n", + "print(\"Cleaned:\\n\",df_covid2['abstract'][1].translate(translator))" ] }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 143, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "count 7502\n", - "unique 7480\n", - "top Unknown\n", - "freq 20\n", - "Name: abstract, dtype: object" + "'!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'" ] }, - "execution_count": 53, + "execution_count": 143, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_covid['abstract'].describe(include='all')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The difference between the unique count and the total count means that either there are some duplicate abstracts or there are blank abstracts that are being counted as the same. 
" + "string.punctuation" ] }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 144, "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
paper_idabstractbody_textauthorstitleabstract_word_countbody_word_count
00015023cc06b5362d332b3baf348d11567ca2fbbthe positive stranded rna genomes of picornavi...vp3, and vp0 (which is further processed to vp...Joseph C. Ward. Lidia Lasecka-Dykes...(The RNA pseudoknots in foot-and-mouth disease...1941709
100340eea543336d54adda18236424de6a5e91c9dduring the past three months a new coronavirus...in december 2019, a novel coronavirus, sars-co...Carla Mavian. Simone Marini...(Regaining perspective on SARS-CoV-2 molecular...1382487
2004f0f8bb66cf446678dc13cf2701feec4f36d76we integrate the human movement and healthcare...the 2019-ncov epidemic has spread across china...Hanchu Zhou. Jianan Yang...(Healthcare-resource-adjusted vulnerabilities ...34749
3005d189d5bd7ac01aee65e934fd3d5186a3f7b27the rapid outbreak of the new coronavirus covi...the outbreak of infectious diseases has always...Behzad Pirouz. Amirsina Golmohammadi...(Relationship between Average Daily Temperatur...2492791
400911cf4f99a3d5ae5e5b787675646a743574496abstrarctthe fast accumulation of viral metage...metagenomic sequencing, which allows us to dir...Jiayu Shang. Yanni Sun(CHEER: hierarCHical taxonomic classification ...1395153
\n", + "
" + ], "text/plain": [ - "count 8811\n", - "unique 8807\n", - "top In previous reports, workers have characterize...\n", - "freq 2\n", - "Name: body_text, dtype: object" + " paper_id \\\n", + "0 0015023cc06b5362d332b3baf348d11567ca2fbb \n", + "1 00340eea543336d54adda18236424de6a5e91c9d \n", + "2 004f0f8bb66cf446678dc13cf2701feec4f36d76 \n", + "3 005d189d5bd7ac01aee65e934fd3d5186a3f7b27 \n", + "4 00911cf4f99a3d5ae5e5b787675646a743574496 \n", + "\n", + " abstract \\\n", + "0 the positive stranded rna genomes of picornavi... \n", + "1 during the past three months a new coronavirus... \n", + "2 we integrate the human movement and healthcare... \n", + "3 the rapid outbreak of the new coronavirus covi... \n", + "4 abstrarctthe fast accumulation of viral metage... \n", + "\n", + " body_text \\\n", + "0 vp3, and vp0 (which is further processed to vp... \n", + "1 in december 2019, a novel coronavirus, sars-co... \n", + "2 the 2019-ncov epidemic has spread across china... \n", + "3 the outbreak of infectious diseases has always... \n", + "4 metagenomic sequencing, which allows us to dir... \n", + "\n", + " authors \\\n", + "0 Joseph C. Ward. Lidia Lasecka-Dykes... \n", + "1 Carla Mavian. Simone Marini... \n", + "2 Hanchu Zhou. Jianan Yang... \n", + "3 Behzad Pirouz. Amirsina Golmohammadi... \n", + "4 Jiayu Shang. Yanni Sun \n", + "\n", + " title abstract_word_count \\\n", + "0 (The RNA pseudoknots in foot-and-mouth disease... 194 \n", + "1 (Regaining perspective on SARS-CoV-2 molecular... 138 \n", + "2 (Healthcare-resource-adjusted vulnerabilities ... 34 \n", + "3 (Relationship between Average Daily Temperatur... 249 \n", + "4 (CHEER: hierarCHical taxonomic classification ... 
139 \n", + "\n", + " body_word_count \n", + "0 1709 \n", + "1 2487 \n", + "2 749 \n", + "3 2791 \n", + "4 5153 " ] }, - "execution_count": 54, + "execution_count": 144, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_covid['body_text'].describe(include='all')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The difference between the unique count and the total count means that there are some duplicate articles present in the dataset. This coulde be possible if the authors had sent their papers to different journals." - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": {}, - "outputs": [], - "source": [ - "df_covid2 = df_covid.copy()" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": {}, - "outputs": [], - "source": [ - "df_covid2.drop_duplicates(['body_text'],inplace=True)" + "df_covid2['abstract'] = df_covid2['abstract'].apply(lambda x: str(x).translate(translator))\n", + "df_covid2.head()" ] }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 145, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Int64Index: 8807 entries, 0 to 8810\n", - "Data columns (total 7 columns):\n", - "paper_id 8807 non-null object\n", - "abstract 7500 non-null object\n", - "body_text 8807 non-null object\n", - "authors 8539 non-null object\n", - "title 8807 non-null object\n", - "abstract_word_count 8807 non-null int64\n", - "body_word_count 8807 non-null int64\n", - "dtypes: int64(2), object(5)\n", - "memory usage: 550.4+ KB\n" - ] + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
paper_idabstractbody_textauthorstitleabstract_word_countbody_word_count
00015023cc06b5362d332b3baf348d11567ca2fbbthe positive stranded rna genomes of picornavi...vp3 and vp0 which is further processed to vp2 ...Joseph C. Ward. Lidia Lasecka-Dykes...(The RNA pseudoknots in foot-and-mouth disease...1941709
100340eea543336d54adda18236424de6a5e91c9dduring the past three months a new coronavirus...in december 2019 a novel coronavirus sarscov2 ...Carla Mavian. Simone Marini...(Regaining perspective on SARS-CoV-2 molecular...1382487
2004f0f8bb66cf446678dc13cf2701feec4f36d76we integrate the human movement and healthcare...the 2019ncov epidemic has spread across china ...Hanchu Zhou. Jianan Yang...(Healthcare-resource-adjusted vulnerabilities ...34749
3005d189d5bd7ac01aee65e934fd3d5186a3f7b27the rapid outbreak of the new coronavirus covi...the outbreak of infectious diseases has always...Behzad Pirouz. Amirsina Golmohammadi...(Relationship between Average Daily Temperatur...2492791
400911cf4f99a3d5ae5e5b787675646a743574496abstrarctthe fast accumulation of viral metage...metagenomic sequencing which allows us to dire...Jiayu Shang. Yanni Sun(CHEER: hierarCHical taxonomic classification ...1395153
\n", + "
" + ], + "text/plain": [ + " paper_id \\\n", + "0 0015023cc06b5362d332b3baf348d11567ca2fbb \n", + "1 00340eea543336d54adda18236424de6a5e91c9d \n", + "2 004f0f8bb66cf446678dc13cf2701feec4f36d76 \n", + "3 005d189d5bd7ac01aee65e934fd3d5186a3f7b27 \n", + "4 00911cf4f99a3d5ae5e5b787675646a743574496 \n", + "\n", + " abstract \\\n", + "0 the positive stranded rna genomes of picornavi... \n", + "1 during the past three months a new coronavirus... \n", + "2 we integrate the human movement and healthcare... \n", + "3 the rapid outbreak of the new coronavirus covi... \n", + "4 abstrarctthe fast accumulation of viral metage... \n", + "\n", + " body_text \\\n", + "0 vp3 and vp0 which is further processed to vp2 ... \n", + "1 in december 2019 a novel coronavirus sarscov2 ... \n", + "2 the 2019ncov epidemic has spread across china ... \n", + "3 the outbreak of infectious diseases has always... \n", + "4 metagenomic sequencing which allows us to dire... \n", + "\n", + " authors \\\n", + "0 Joseph C. Ward. Lidia Lasecka-Dykes... \n", + "1 Carla Mavian. Simone Marini... \n", + "2 Hanchu Zhou. Jianan Yang... \n", + "3 Behzad Pirouz. Amirsina Golmohammadi... \n", + "4 Jiayu Shang. Yanni Sun \n", + "\n", + " title abstract_word_count \\\n", + "0 (The RNA pseudoknots in foot-and-mouth disease... 194 \n", + "1 (Regaining perspective on SARS-CoV-2 molecular... 138 \n", + "2 (Healthcare-resource-adjusted vulnerabilities ... 34 \n", + "3 (Relationship between Average Daily Temperatur... 249 \n", + "4 (CHEER: hierarCHical taxonomic classification ... 
139 \n", + "\n", + " body_word_count \n", + "0 1709 \n", + "1 2487 \n", + "2 749 \n", + "3 2791 \n", + "4 5153 " + ] + }, + "execution_count": 145, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "df_covid2.info()" + "df_covid2['body_text'] = df_covid2['body_text'].apply(lambda x: str(x).translate(translator))\n", + "df_covid2.head()" ] }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 146, "metadata": {}, "outputs": [ { @@ -4231,79 +3048,56 @@ " \n", " \n", " \n", - " abstract_word_count\n", - " body_word_count\n", + " abstract\n", " \n", " \n", " \n", " \n", - " count\n", - " 8807.000000\n", - " 8807.000000\n", - " \n", - " \n", - " mean\n", - " 157.050074\n", - " 4734.409561\n", - " \n", - " \n", - " std\n", - " 97.781201\n", - " 7540.143374\n", - " \n", - " \n", - " min\n", - " 1.000000\n", - " 1.000000\n", + " 0\n", + " the positive stranded rna genomes of picornavi...\n", " \n", " \n", - " 25%\n", - " 99.000000\n", - " 2150.000000\n", + " 1\n", + " during the past three months a new coronavirus...\n", " \n", " \n", - " 50%\n", - " 162.000000\n", - " 3426.000000\n", + " 2\n", + " we integrate the human movement and healthcare...\n", " \n", " \n", - " 75%\n", - " 220.000000\n", - " 5343.500000\n", + " 3\n", + " the rapid outbreak of the new coronavirus covi...\n", " \n", " \n", - " max\n", - " 1163.000000\n", - " 257301.000000\n", + " 4\n", + " abstrarctthe fast accumulation of viral metage...\n", " \n", " \n", "\n", "" ], "text/plain": [ - " abstract_word_count body_word_count\n", - "count 8807.000000 8807.000000\n", - "mean 157.050074 4734.409561\n", - "std 97.781201 7540.143374\n", - "min 1.000000 1.000000\n", - "25% 99.000000 2150.000000\n", - "50% 162.000000 3426.000000\n", - "75% 220.000000 5343.500000\n", - "max 1163.000000 257301.000000" + " abstract\n", + "0 the positive stranded rna genomes of picornavi...\n", + "1 during the past three months a new coronavirus...\n", + "2 we integrate the human movement 
and healthcare...\n", + "3 the rapid outbreak of the new coronavirus covi...\n", + "4 abstrarctthe fast accumulation of viral metage..." ] }, - "execution_count": 58, + "execution_count": 146, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_covid2.describe()" + "abstracts = df_covid2.drop([\"paper_id\",\"body_text\", \"abstract_word_count\", \"body_word_count\", \"authors\", \"title\"], axis=1)\n", + "abstracts.head()" ] }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 147, "metadata": {}, "outputs": [ { @@ -4327,95 +3121,73 @@ " \n", " \n", " \n", - " abstract_word_count\n", - " body_word_count\n", + " body_text\n", " \n", " \n", " \n", " \n", - " count\n", - " 8811.000000\n", - " 8811.000000\n", - " \n", - " \n", - " mean\n", - " 157.014754\n", - " 4735.074793\n", - " \n", - " \n", - " std\n", - " 97.787300\n", - " 7538.883651\n", - " \n", - " \n", - " min\n", - " 1.000000\n", - " 1.000000\n", + " 0\n", + " vp3 and vp0 which is further processed to vp2 ...\n", " \n", " \n", - " 25%\n", - " 99.000000\n", - " 2151.500000\n", + " 1\n", + " in december 2019 a novel coronavirus sarscov2 ...\n", " \n", " \n", - " 50%\n", - " 162.000000\n", - " 3426.000000\n", + " 2\n", + " the 2019ncov epidemic has spread across china ...\n", " \n", " \n", - " 75%\n", - " 220.000000\n", - " 5344.000000\n", + " 3\n", + " the outbreak of infectious diseases has always...\n", " \n", " \n", - " max\n", - " 1163.000000\n", - " 257301.000000\n", + " 4\n", + " metagenomic sequencing which allows us to dire...\n", " \n", " \n", "\n", "" ], "text/plain": [ - " abstract_word_count body_word_count\n", - "count 8811.000000 8811.000000\n", - "mean 157.014754 4735.074793\n", - "std 97.787300 7538.883651\n", - "min 1.000000 1.000000\n", - "25% 99.000000 2151.500000\n", - "50% 162.000000 3426.000000\n", - "75% 220.000000 5344.000000\n", - "max 1163.000000 257301.000000" + " body_text\n", + "0 vp3 and vp0 which is further processed to vp2 ...\n", + "1 in 
december 2019 a novel coronavirus sarscov2 ...\n", + "2 the 2019ncov epidemic has spread across china ...\n", + "3 the outbreak of infectious diseases has always...\n", + "4 metagenomic sequencing which allows us to dire..." ] }, - "execution_count": 59, + "execution_count": 147, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_covid.describe()" + "bodytexts = df_covid2.drop([\"paper_id\",\"abstract\", \"abstract_word_count\", \"body_word_count\", \"authors\", \"title\"], axis=1)\n", + "bodytexts.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We now clean the text data so that our key words are clear and we do not have to worry about details we are not concerned about. [ref1](https://towardsdatascience.com/nlp-text-preprocessing-a-practical-guide-and-template-d80874676e79) [ref2](https://www.geeksforgeeks.org/text-preprocessing-in-python-set-1/)" + "### TODO: Clean first word dirty data" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 148, "metadata": {}, + "outputs": [], "source": [ - "Firstly By now looking at the abstract columns, we must notice that the text scraped from these articles is not perfect.\n", - "\n", - "For example, many of the abstracts start with the word *Abstract* glued to the beginning of the first word of the real abstract body. 
" + "tempdf = pd.DataFrame(abstracts['abstract'].apply(lambda x: (str(x).split())[0] if x != \"\" else x))\n", + "#print(tempdf.to_string())" ] }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 149, "metadata": {}, "outputs": [ { @@ -4439,701 +3211,925 @@ " \n", " \n", " \n", - " paper_id\n", " abstract\n", - " body_text\n", - " authors\n", - " title\n", - " abstract_word_count\n", - " body_word_count\n", + " first_word\n", " \n", " \n", " \n", " \n", " 0\n", - " e6d882be4961d1bdd7507b4a29d86b650de0895d\n", - " Middle East Respiratory Syndrome coronavirus (...\n", - " Middle East respiratory syndrome coronavirus (...\n", - " Oh, Myoung-don. Park, Wan Beom...\n", - " (Middle East respiratory syndrome: what we lea...\n", - " 123\n", - " 3907\n", + " the\n", + " 3\n", " \n", " \n", " 1\n", - " d0c4236f95ba2b3d5cdc71fde524b26262f05dd3\n", - " In theoretical physics, there exist two basic ...\n", - " In theoretical physics, there exist two basic ...\n", - " Zhang, Ren. Zhang, Chun-Ting\n", - " (A Brief Review: The Z-curve Theory and its Ap...\n", - " 169\n", - " 6359\n", + " during\n", + " 6\n", " \n", " \n", " 2\n", - " d24afd9ee025b53015824be203db539009964fbd\n", - " BACKGROUND: The aim of this research was to ev...\n", - " In view of the role of thin-section CT scannin...\n", - " Xing, Zhi-Heng. Sun, Xin...\n", - " (Thin-section Computed Tomography Detects Long...\n", - " 326\n", - " 2937\n", + " we\n", + " 2\n", " \n", " \n", " 3\n", - " 4a3941003ea2673397975ae8bc2536ad59f789e5\n", - " INTRODUCTION: The Major Incident Hospital (MIH...\n", - " injury. Medical management does not simply inv...\n", - " Marres, G. M. H.. 
van der Eijk, J....\n", - " (Evaluation of admissions to the Major Inciden...\n", - " 264\n", - " 4830\n", + " the\n", + " 3\n", + " \n", + " \n", + " 4\n", + " abstrarctthe\n", + " 12\n", + " \n", + " \n", + "\n", + "" + ], + "text/plain": [ + " abstract first_word\n", + "0 the 3\n", + "1 during 6\n", + "2 we 2\n", + "3 the 3\n", + "4 abstrarctthe 12" + ] + }, + "execution_count": 149, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tempdf['first_word'] = tempdf['abstract'].apply(lambda x: len(str(x)))\n", + "tempdf.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 150, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
abstractfirst_word
4905e2b35719215760ebed8f3e93fdf6ea8ec9bcfMice infected with the neurotropic JHM strain ...1. Frequently change and/or sterilize gloves t...Carbajal, Kevin S.. Weinger, Jason G....(Surgical Transplantation of Mouse Neural Stem...1371300abstrarctthe12
5319004f23d1af4357edb2a3862f2619be23a21a6Diagnostics play a central role in the early d...► The Middle East respiratory syndrome-coronav...Kelly-Cirino, Cassandra. Mazzola, Laura T...(An updated roadmap for MERS-CoV research and ...2123632objective9
64971940ab68e4950b28410a208166900e8ba9c07Despite of the role of domestic dogs as reserv...Since its initial identification in the late 1...Vieira, Flávia V.. Hoffmann, Daniel J....(Circulation of canine parvovirus among dogs l...1371519infectious10
7cae1f2fd5785845caa42d6497361eba46a68f11dDiversity and plasticity are two hallmarks of ...Macrophages were first identified by Elie Metc...Liu, Yan-Cun. Zou, Xian-Biao...(Macrophage Polarization in Inflammatory Disea...1114608importance10
82ffddf5caaef38207b58710a93ee8361518813c9This commentary offers suggestions for improvi...I n the early 20th century, the new field of m...Kahn, Laura H.(The need for one health degree programs, 40)5611779backgroundnipah15
94bae83e2441c3738d96e49c21c9be0a4c85b4a92The protective efficacy of DNA plasmids encodi...Avian infectious bronchitis (IB) is a major di...Yan, Fang. Zhao, Yujun...(Protection of chickens against infectious bro...179298710background10
11background10
12introduction12
16posttranscriptional19
17abstractduring14
20abstracthuman13
21abstractribosomal17
23background10
25summarythe10
27herewith8
29background10
30abstractthe11
31background10
33background10
34abstractmotivationlongread26
35abstractto10
362019ncov8
45background10
46infection9
53summarymany11
57background10
58quantification14
60background10
62background10
63motivationunderstanding23
.........
1861polyomaviruses14
1863introduction12
1864background10
1865background10
1866traditionally13
1868background10
1869background10
1872enterovirus11
1875motivation10
1876abstractnew11
1877previously10
1878sarscov28
1881abstractzika12
1883background10
1887abstractthe11
1889background10
1892objectives10
1894background10
1899coronavirus11
1902abstractschizophrenia21
1908influenza9
1909background10
1910introductioncryptococcosis26
1911abstractformation17
1913abstractpositivesense21
1922introduction12
1924background10
1925abstractcurrent15
1926abstractextracellular21
1931summarycoronaviruses20
\n", + "

919 rows × 2 columns

\n", "
" ], "text/plain": [ - " paper_id \\\n", - "0 e6d882be4961d1bdd7507b4a29d86b650de0895d \n", - "1 d0c4236f95ba2b3d5cdc71fde524b26262f05dd3 \n", - "2 d24afd9ee025b53015824be203db539009964fbd \n", - "3 4a3941003ea2673397975ae8bc2536ad59f789e5 \n", - "4 905e2b35719215760ebed8f3e93fdf6ea8ec9bcf \n", - "5 319004f23d1af4357edb2a3862f2619be23a21a6 \n", - "6 4971940ab68e4950b28410a208166900e8ba9c07 \n", - "7 cae1f2fd5785845caa42d6497361eba46a68f11d \n", - "8 2ffddf5caaef38207b58710a93ee8361518813c9 \n", - "9 4bae83e2441c3738d96e49c21c9be0a4c85b4a92 \n", - "\n", - " abstract \\\n", - "0 Middle East Respiratory Syndrome coronavirus (... \n", - "1 In theoretical physics, there exist two basic ... \n", - "2 BACKGROUND: The aim of this research was to ev... \n", - "3 INTRODUCTION: The Major Incident Hospital (MIH... \n", - "4 Mice infected with the neurotropic JHM strain ... \n", - "5 Diagnostics play a central role in the early d... \n", - "6 Despite of the role of domestic dogs as reserv... \n", - "7 Diversity and plasticity are two hallmarks of ... \n", - "8 This commentary offers suggestions for improvi... \n", - "9 The protective efficacy of DNA plasmids encodi... \n", - "\n", - " body_text \\\n", - "0 Middle East respiratory syndrome coronavirus (... \n", - "1 In theoretical physics, there exist two basic ... \n", - "2 In view of the role of thin-section CT scannin... \n", - "3 injury. Medical management does not simply inv... \n", - "4 1. Frequently change and/or sterilize gloves t... \n", - "5 ► The Middle East respiratory syndrome-coronav... \n", - "6 Since its initial identification in the late 1... \n", - "7 Macrophages were first identified by Elie Metc... \n", - "8 I n the early 20th century, the new field of m... \n", - "9 Avian infectious bronchitis (IB) is a major di... \n", - "\n", - " authors \\\n", - "0 Oh, Myoung-don. Park, Wan Beom... \n", - "1 Zhang, Ren. Zhang, Chun-Ting \n", - "2 Xing, Zhi-Heng. Sun, Xin... \n", - "3 Marres, G. M. H.. van der Eijk, J.... 
\n", - "4 Carbajal, Kevin S.. Weinger, Jason G.... \n", - "5 Kelly-Cirino, Cassandra. Mazzola, Laura T... \n", - "6 Vieira, Flávia V.. Hoffmann, Daniel J.... \n", - "7 Liu, Yan-Cun. Zou, Xian-Biao... \n", - "8 Kahn, Laura H. \n", - "9 Yan, Fang. Zhao, Yujun... \n", - "\n", - " title abstract_word_count \\\n", - "0 (Middle East respiratory syndrome: what we lea... 123 \n", - "1 (A Brief Review: The Z-curve Theory and its Ap... 169 \n", - "2 (Thin-section Computed Tomography Detects Long... 326 \n", - "3 (Evaluation of admissions to the Major Inciden... 264 \n", - "4 (Surgical Transplantation of Mouse Neural Stem... 137 \n", - "5 (An updated roadmap for MERS-CoV research and ... 212 \n", - "6 (Circulation of canine parvovirus among dogs l... 137 \n", - "7 (Macrophage Polarization in Inflammatory Disea... 111 \n", - "8 (The need for one health degree programs, 40) 56 \n", - "9 (Protection of chickens against infectious bro... 179 \n", + " abstract first_word\n", + "4 abstrarctthe 12\n", + "5 objective 9\n", + "6 infectious 10\n", + "7 importance 10\n", + "9 backgroundnipah 15\n", + "10 background 10\n", + "11 background 10\n", + "12 introduction 12\n", + "16 posttranscriptional 19\n", + "17 abstractduring 14\n", + "20 abstracthuman 13\n", + "21 abstractribosomal 17\n", + "23 background 10\n", + "25 summarythe 10\n", + "27 herewith 8\n", + "29 background 10\n", + "30 abstractthe 11\n", + "31 background 10\n", + "33 background 10\n", + "34 abstractmotivationlongread 26\n", + "35 abstractto 10\n", + "36 2019ncov 8\n", + "45 background 10\n", + "46 infection 9\n", + "53 summarymany 11\n", + "57 background 10\n", + "58 quantification 14\n", + "60 background 10\n", + "62 background 10\n", + "63 motivationunderstanding 23\n", + "... ... 
...\n", + "1861 polyomaviruses 14\n", + "1863 introduction 12\n", + "1864 background 10\n", + "1865 background 10\n", + "1866 traditionally 13\n", + "1868 background 10\n", + "1869 background 10\n", + "1872 enterovirus 11\n", + "1875 motivation 10\n", + "1876 abstractnew 11\n", + "1877 previously 10\n", + "1878 sarscov2 8\n", + "1881 abstractzika 12\n", + "1883 background 10\n", + "1887 abstractthe 11\n", + "1889 background 10\n", + "1892 objectives 10\n", + "1894 background 10\n", + "1899 coronavirus 11\n", + "1902 abstractschizophrenia 21\n", + "1908 influenza 9\n", + "1909 background 10\n", + "1910 introductioncryptococcosis 26\n", + "1911 abstractformation 17\n", + "1913 abstractpositivesense 21\n", + "1922 introduction 12\n", + "1924 background 10\n", + "1925 abstractcurrent 15\n", + "1926 abstractextracellular 21\n", + "1931 summarycoronaviruses 20\n", "\n", - " body_word_count \n", - "0 3907 \n", - "1 6359 \n", - "2 2937 \n", - "3 4830 \n", - "4 1300 \n", - "5 3632 \n", - "6 1519 \n", - "7 4608 \n", - "8 1177 \n", - "9 2987 " + "[919 rows x 2 columns]" + ] + }, + "execution_count": 150, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tempdf[tempdf['first_word'] > 7]" + ] + }, + { + "cell_type": "code", + "execution_count": 151, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 False\n", + "1 True\n", + "2 False\n", + "3 False\n", + "4 True\n", + "5 False\n", + "6 True\n", + "7 True\n", + "8 False\n", + "9 True\n", + "10 True\n", + "11 True\n", + "12 True\n", + "13 False\n", + "14 True\n", + "15 False\n", + "16 True\n", + "17 True\n", + "18 False\n", + "19 True\n", + "20 True\n", + "21 True\n", + "22 False\n", + "23 True\n", + "24 False\n", + "25 True\n", + "26 False\n", + "27 False\n", + "28 False\n", + "29 True\n", + " ... 
\n", + "1904 False\n", + "1905 False\n", + "1906 False\n", + "1907 False\n", + "1908 False\n", + "1909 True\n", + "1910 True\n", + "1911 True\n", + "1912 False\n", + "1913 True\n", + "1914 False\n", + "1915 False\n", + "1916 False\n", + "1917 False\n", + "1918 False\n", + "1919 False\n", + "1920 False\n", + "1921 False\n", + "1922 True\n", + "1923 False\n", + "1924 True\n", + "1925 True\n", + "1926 True\n", + "1927 False\n", + "1928 True\n", + "1929 False\n", + "1930 True\n", + "1931 True\n", + "1932 False\n", + "1933 False\n", + "Name: abstract, Length: 1934, dtype: bool" ] }, - "execution_count": 60, + "execution_count": 151, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_covid2['abstract'] = df_covid2['abstract'].apply(lambda x: str(x).replace('Abstract',''))\n", - "df_covid2['abstract'] = df_covid2['abstract'].apply(lambda x: str(x).replace('abstract',''))\n", - "df_covid2.head(10)" + "pattern = r'[background][summary][abstract]'\n", + "tempdf['abstract'].str.contains(pattern)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now let us remove all punctuation from text and change it to lowercase" + "### END TODO" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Data Preprocessing" + "## Forming n-grams" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Word Cloud of Abstracts" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Vectorize with HashingVectorizer" + ] + }, + { + "cell_type": "code", + "execution_count": 152, + "metadata": {}, + "outputs": [], + "source": [ + "from nltk import word_tokenize\n", + "from nltk.util import ngrams" ] }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 153, + "metadata": {}, + "outputs": [], + "source": [ + "words = []\n", + "for text in abstracts['abstract']:\n", + " for word in text.split(\" \"):\n", + " words.append(word)" + ] + }, + { + "cell_type": "code", + "execution_count": 154, 
"metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "imported text preprocessing libraries, time taken:0:00:00.000157\n" + "['the', 'positive', 'stranded', 'rna', 'genomes', 'of', 'picornaviruses', 'comprise', 'a', 'single']\n" ] } ], "source": [ - "start_time = datetime.now()\n", - "# import nltk \n", - "import string \n", - "import re \n", - "end_time = datetime.now()\n", - "print(f'imported text preprocessing libraries, time taken:{end_time-start_time}')" + "print(words[:10])" + ] + }, + { + "cell_type": "code", + "execution_count": 155, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import HashingVectorizer\n", + "\n", + "# hash vectorizer instance\n", + "hvec = HashingVectorizer(lowercase=False, analyzer=lambda l:l, n_features=2**12)\n", + "\n", + "# features matrix X\n", + "X = hvec.fit_transform(words)" ] }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 491, "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
paper_idabstractbody_textauthorstitleabstract_word_countbody_word_count
0e6d882be4961d1bdd7507b4a29d86b650de0895dmiddle east respiratory syndrome coronavirus (...Middle East respiratory syndrome coronavirus (...Oh, Myoung-don. Park, Wan Beom...(Middle East respiratory syndrome: what we lea...1233907
1d0c4236f95ba2b3d5cdc71fde524b26262f05dd3in theoretical physics, there exist two basic ...In theoretical physics, there exist two basic ...Zhang, Ren. Zhang, Chun-Ting(A Brief Review: The Z-curve Theory and its Ap...1696359
2d24afd9ee025b53015824be203db539009964fbdbackground: the aim of this research was to ev...In view of the role of thin-section CT scannin...Xing, Zhi-Heng. Sun, Xin...(Thin-section Computed Tomography Detects Long...3262937
34a3941003ea2673397975ae8bc2536ad59f789e5introduction: the major incident hospital (mih...injury. Medical management does not simply inv...Marres, G. M. H.. van der Eijk, J....(Evaluation of admissions to the Major Inciden...2644830
4905e2b35719215760ebed8f3e93fdf6ea8ec9bcfmice infected with the neurotropic jhm strain ...1. Frequently change and/or sterilize gloves t...Carbajal, Kevin S.. Weinger, Jason G....(Surgical Transplantation of Mouse Neural Stem...1371300
\n", - "
" - ], "text/plain": [ - " paper_id \\\n", - "0 e6d882be4961d1bdd7507b4a29d86b650de0895d \n", - "1 d0c4236f95ba2b3d5cdc71fde524b26262f05dd3 \n", - "2 d24afd9ee025b53015824be203db539009964fbd \n", - "3 4a3941003ea2673397975ae8bc2536ad59f789e5 \n", - "4 905e2b35719215760ebed8f3e93fdf6ea8ec9bcf \n", - "\n", - " abstract \\\n", - "0 middle east respiratory syndrome coronavirus (... \n", - "1 in theoretical physics, there exist two basic ... \n", - "2 background: the aim of this research was to ev... \n", - "3 introduction: the major incident hospital (mih... \n", - "4 mice infected with the neurotropic jhm strain ... \n", - "\n", - " body_text \\\n", - "0 Middle East respiratory syndrome coronavirus (... \n", - "1 In theoretical physics, there exist two basic ... \n", - "2 In view of the role of thin-section CT scannin... \n", - "3 injury. Medical management does not simply inv... \n", - "4 1. Frequently change and/or sterilize gloves t... \n", - "\n", - " authors \\\n", - "0 Oh, Myoung-don. Park, Wan Beom... \n", - "1 Zhang, Ren. Zhang, Chun-Ting \n", - "2 Xing, Zhi-Heng. Sun, Xin... \n", - "3 Marres, G. M. H.. van der Eijk, J.... \n", - "4 Carbajal, Kevin S.. Weinger, Jason G.... \n", - "\n", - " title abstract_word_count \\\n", - "0 (Middle East respiratory syndrome: what we lea... 123 \n", - "1 (A Brief Review: The Z-curve Theory and its Ap... 169 \n", - "2 (Thin-section Computed Tomography Detects Long... 326 \n", - "3 (Evaluation of admissions to the Major Inciden... 264 \n", - "4 (Surgical Transplantation of Mouse Neural Stem... 
137 \n", - "\n", - " body_word_count \n", - "0 3907 \n", - "1 6359 \n", - "2 2937 \n", - "3 4830 \n", - "4 1300 " + "(429481, 4096)" ] }, - "execution_count": 62, + "execution_count": 491, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_covid2['abstract'] = df_covid2['abstract'].apply(lambda x: str(x).lower())\n", - "df_covid2.head()" + "X.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Separate Training and Test Set" ] }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 157, "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
paper_idabstractbody_textauthorstitleabstract_word_countbody_word_count
0e6d882be4961d1bdd7507b4a29d86b650de0895dmiddle east respiratory syndrome coronavirus (...middle east respiratory syndrome coronavirus (...Oh, Myoung-don. Park, Wan Beom...(Middle East respiratory syndrome: what we lea...1233907
1d0c4236f95ba2b3d5cdc71fde524b26262f05dd3in theoretical physics, there exist two basic ...in theoretical physics, there exist two basic ...Zhang, Ren. Zhang, Chun-Ting(A Brief Review: The Z-curve Theory and its Ap...1696359
2d24afd9ee025b53015824be203db539009964fbdbackground: the aim of this research was to ev...in view of the role of thin-section ct scannin...Xing, Zhi-Heng. Sun, Xin...(Thin-section Computed Tomography Detects Long...3262937
34a3941003ea2673397975ae8bc2536ad59f789e5introduction: the major incident hospital (mih...injury. medical management does not simply inv...Marres, G. M. H.. van der Eijk, J....(Evaluation of admissions to the Major Inciden...2644830
4905e2b35719215760ebed8f3e93fdf6ea8ec9bcfmice infected with the neurotropic jhm strain ...1. frequently change and/or sterilize gloves t...Carbajal, Kevin S.. Weinger, Jason G....(Surgical Transplantation of Mouse Neural Stem...1371300
\n", - "
" - ], - "text/plain": [ - " paper_id \\\n", - "0 e6d882be4961d1bdd7507b4a29d86b650de0895d \n", - "1 d0c4236f95ba2b3d5cdc71fde524b26262f05dd3 \n", - "2 d24afd9ee025b53015824be203db539009964fbd \n", - "3 4a3941003ea2673397975ae8bc2536ad59f789e5 \n", - "4 905e2b35719215760ebed8f3e93fdf6ea8ec9bcf \n", - "\n", - " abstract \\\n", - "0 middle east respiratory syndrome coronavirus (... \n", - "1 in theoretical physics, there exist two basic ... \n", - "2 background: the aim of this research was to ev... \n", - "3 introduction: the major incident hospital (mih... \n", - "4 mice infected with the neurotropic jhm strain ... \n", - "\n", - " body_text \\\n", - "0 middle east respiratory syndrome coronavirus (... \n", - "1 in theoretical physics, there exist two basic ... \n", - "2 in view of the role of thin-section ct scannin... \n", - "3 injury. medical management does not simply inv... \n", - "4 1. frequently change and/or sterilize gloves t... \n", - "\n", - " authors \\\n", - "0 Oh, Myoung-don. Park, Wan Beom... \n", - "1 Zhang, Ren. Zhang, Chun-Ting \n", - "2 Xing, Zhi-Heng. Sun, Xin... \n", - "3 Marres, G. M. H.. van der Eijk, J.... \n", - "4 Carbajal, Kevin S.. Weinger, Jason G.... \n", - "\n", - " title abstract_word_count \\\n", - "0 (Middle East respiratory syndrome: what we lea... 123 \n", - "1 (A Brief Review: The Z-curve Theory and its Ap... 169 \n", - "2 (Thin-section Computed Tomography Detects Long... 326 \n", - "3 (Evaluation of admissions to the Major Inciden... 264 \n", - "4 (Surgical Transplantation of Mouse Neural Stem... 
137 \n", - "\n", - " body_word_count \n", - "0 3907 \n", - "1 6359 \n", - "2 2937 \n", - "3 4830 \n", - "4 1300 " - ] - }, - "execution_count": 63, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "X_train size: 800\n", + "X_test size: 200 \n", + "\n" + ] } ], "source": [ - "df_covid2['body_text'] = df_covid2['body_text'].apply(lambda x: str(x).lower())\n", - "df_covid2.head()" + "from sklearn.model_selection import train_test_split\n", + "\n", + "# test set size of 20% of the data and the random seed 42 <3\n", + "X_train, X_test = train_test_split(X[:1000].toarray(), test_size=0.2, random_state=42)\n", + "\n", + "print(\"X_train size:\", len(X_train))\n", + "print(\"X_test size:\", len(X_test), \"\\n\")" ] }, { - "cell_type": "code", - "execution_count": 64, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "str.maketrans?" + "# Dimensionality Reduction with t-SNE" ] }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 158, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Punctuated:\n", - " in theoretical physics, there exist two basic mathematical approaches, algebraic and geometrical methods, which, in most cases, are complementary. in the area of genome sequence analysis, however, algebraic approaches have been widely used, while geometrical approaches have been less explored for a long time. the z-curve theory is a geometrical approach to genome analysis. the z-curve is a three-dimensional curve that represents a given dna sequence in the sense that each can be uniquely reconstructed given the other. the z-curve, therefore, contains all the information that the corresponding dna sequence carries. the analysis of a dna sequence can then be performed through studying the corresponding z-curve. 
the z-curve method has found applications in a wide range of areas in the past two decades, including the identifications of protein-coding genes, replication origins, horizontally-transferred genomic islands, promoters, translational start sides and isochores, as well as studies on phylogenetics, genome visualization and comparative genomics. here, we review the progress of z-curve studies from aspects of both theory and applications in genome analysis. \n", - "\n", - "\n", - "\n", - "Cleaned:\n", - " in theoretical physics there exist two basic mathematical approaches algebraic and geometrical methods which in most cases are complementary in the area of genome sequence analysis however algebraic approaches have been widely used while geometrical approaches have been less explored for a long time the zcurve theory is a geometrical approach to genome analysis the zcurve is a threedimensional curve that represents a given dna sequence in the sense that each can be uniquely reconstructed given the other the zcurve therefore contains all the information that the corresponding dna sequence carries the analysis of a dna sequence can then be performed through studying the corresponding zcurve the zcurve method has found applications in a wide range of areas in the past two decades including the identifications of proteincoding genes replication origins horizontallytransferred genomic islands promoters translational start sides and isochores as well as studies on phylogenetics genome visualization and comparative genomics here we review the progress of zcurve studies from aspects of both theory and applications in genome analysis\n" + "[t-SNE] Computing 16 nearest neighbors...\n", + "[t-SNE] Indexed 800 samples in 0.084s...\n", + "[t-SNE] Computed neighbors for 800 samples in 3.883s...\n", + "[t-SNE] Computed conditional probabilities for sample 800 / 800\n", + "[t-SNE] Mean sigma: 0.000000\n", + "[t-SNE] KL divergence after 250 iterations with early exaggeration: 
72.336159\n", + "[t-SNE] KL divergence after 1000 iterations: 0.633702\n" ] } ], "source": [ - "print(\"Punctuated:\\n\",df_covid2['abstract'][1],'\\n\\n\\n')\n", - "translator = str.maketrans(' ',' ',string.punctuation)\n", - "print(\"Cleaned:\\n\",df_covid2['abstract'][1].translate(translator))" + "from sklearn.manifold import TSNE\n", + "\n", + "tsne = TSNE(verbose=1, perplexity=5)\n", + "X_embedded = tsne.fit_transform(X_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's plot the result:" + ] + }, + { + "cell_type": "code", + "execution_count": 159, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from matplotlib import pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "# sns settings\n", + "sns.set(rc={'figure.figsize':(15,15)})\n", + "\n", + "# colors\n", + "palette = sns.color_palette(\"bright\", 1)\n", + "\n", + "# plot\n", + "sns.scatterplot(X_embedded[:,0], X_embedded[:,1], palette=palette)\n", + "\n", + "plt.title(\"t-SNE Covid-19 Articles\")\n", + "# plt.savefig(\"plots/t-sne_covid19.png\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Unsupervised Learning: Clustering with K-Means" + ] + }, + { + "cell_type": "code", + "execution_count": 160, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.cluster import KMeans\n", + "\n", + "k = 10\n", + "kmeans = KMeans(n_clusters=k, n_jobs=4, verbose=10)\n", + "y_pred = kmeans.fit_predict(X_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 161, + "metadata": {}, + "outputs": [], + "source": [ + "y_train = y_pred" + ] + }, + { + "cell_type": "code", + "execution_count": 162, + "metadata": {}, + "outputs": [], + "source": [ + "y_test = kmeans.predict(X_test)" ] }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 163, "metadata": {}, "outputs": [ { "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAA3UAAANfCAYAAAB+OfRwAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzs3XuclnP+x/HXdd3nOdVMTenkkFKETQoVkiRFaUsOKUoUyqItIishOe2i0jofylIRSUuhFBIVQiEpOsk0NdMc7/N1/f6YXzfTzLDTfY+ZMe/n47GP3ft7X/fn+t5zTbP3+/4eLsO2bRsRERERERGplczq7oCIiIiIiIgcPIU6ERERERGRWkyhTkREREREpBZTqBMREREREanFFOpERERERERqMYU6ERERERGRWkyhTkSkBrviiivIyckp97nCwkJuu+02+vbtS79+/ejfvz8vv/xy7PmhQ4cydOhQLMuKteXk5NCmTZvY4zZt2tC3b1/OP//8Uv/ZsWNHuef87LPPGDFiBOeffz59+/Zl5MiRfPfddwf9/h555BEWLFhQpv3AfpYnFAoxfPhwFi9eHGvbsWMHI0aMoHfv3gwcOJA333zzN2vMnj2bNm3asG7dut88bvny5TzyyCMALF26lLvvvvs3jx86dGipflWlrKwsJkyYEPs9GDRoEO+++27s+TZt2lT4O/R7vvzyS26//fZEdbVc5513Hp988kmVnkNE5M/OWd0dEBGRiq1cubLC5/75z3+SlJTEwoULMQyDrKwsLrroIpo0acKpp54KwLp163jssce49tprK6zz/PPPk5GR8bt9WbNmDePHj2fGjBkce+yxACxcuJChQ4fy1ltv/U81DnT99ddX+jUAn3/+OXfeeSdbtmzhoosuirVPmDCBk08+maeffprCwkIuu+wyWrZsSdu2bcutM2fOHPr27cvzzz9P+/btKzzfV199RV5eHgA9evSgR48eB9XvRMvJyeHiiy/m+uuvZ+rUqRiGwbfffsvw4cPx+Xx07do1rvrff/89WVlZCeqtiIhUFYU6EZEa6pZbbgHg8ssv54knnqBJkyalns/OzqZBgwaEw2HcbjeNGzdm+vTp1K9fP3bMtddey9NPP02XLl1+M7T8L6ZNm8a1114bC3QA/fr1w+PxEI1GAZg7dy6zZ8/GNE0aNmzIP/7xDxo2bEi3bt1YsmQJmZmZAAwaNIgxY8bw1ltv0bp1a0aMGMHbb7/NQw89hM/nK3WO8syePZu///3vPP7446XaN2zYwL333gtASkoKJ598Mu+88065oe6TTz4hLy+P8ePH07NnT3bt2hX7GQ8dOpR69eqxZcsW+vTpw5w5c4hGo6SmpnLYYYexZMkSHn/8cbKzs5k0aRJbtmzBNE0uvvhiLrvsslLn+eyzz3jwwQfx+/2YpsmYMWPo3r072dnZ3HzzzeTm5gLQrVs3brjhhspcEl588UU6dOhA//79Y21t27Zl2rRppKWllTr21VdfjfX7wMdr167l3nvvjY3qjho1iuOPP55p06ZRUFDALbfcwtSpU1m2bBn//ve/CYfDeL1ebr75Zk444QSmT5/OunXr2L17N23atOHBBx/k3//+N2+//TaWZdGsWTMmTZpE48aN+f7777n11lvx+/20bNmS4uLiSr1nEREpS6FORKSGmjp1Kq+++mqFI2ljxozh+uuv55RTTuGEE06gQ4cO9OnThxYtWsSOOeKII7jpppsYN25cudMcoSQ0muYvs/GbN2/Oo48+Wua49evXM2nSpDLtvXr1AmDVqlU89dRTzJ07l4yMDF599VVGjx7Nf//7X3r27MnChQsZMWIEmzdvZs+ePZx22mm89dZbAOzZs4dbb72VOXPm0KpVqzJh7UD/+te/AMocd/zxx/Pqq69y3XXXkZuby/vvv8+JJ55Ybo0XX3yRvn370rhxY0455RReeOEFxo8fH3s+LS0tNn3Ttm1yc3O
58cYbefXVV2PHTJ48mcMPP5yZM2dSUFDAJZdcQrdu3WLP5+Xlccstt/D000/TvHlzsrKyuPDCC2nTpg2vvfYazZs355lnnqG4uJiJEydSUFBAamrqb773X1u/fj2nnXZamfZOnTr9zzUApk+fzvDhwzn33HP59ttvmTt3Lr169eJvf/sbS5YsYerUqfz444889NBDzJo1i/T0dDZt2sTw4cN5++23Adi5cyeLFi3C6XSyYMECvvvuO15++WWcTidz587ltttu48knn2TcuHFceumlDBo0iE8//ZRLL720Un0VEZGyFOpERGqptm3bsnjxYjZs2MCaNWtYuXIljz32GI888ghnnnlm7LgLL7yQDz/8kDvuuINbb721TJ3/dfqlaZql1ucd6IMPPqBPnz6xWgMGDGDKlCns2LGDQYMGMXnyZEaMGMH8+fMZOHBgqSD56aefctRRR9GqVSsALrroolhwq4z77ruPqVOn0q9fP5o1a8YZZ5xBIBAoc1x2djZLly5l/vz5APTv35877riD0aNHk5SUBEDHjh1/93wfffRRLAimpqayaNGiUs+vW7eO7OxsRo8eHWszDIONGzdy2mmnMXLkSHbt2kWXLl34+9//XqlAt7+WbduVek15evfuzZ133smyZcvo0qULY8eOLXPMypUr2b17N8OGDSt1/m3btgHQvn17nM6SjxXvvfceX331FQMHDgTAsiz8fj+5ubls3LgxNrJ44okn0rp167j7LyJS1ynUiYjUAl999RW33XZb7PH8+fO58847GTt2LMceeyzHHnssw4cPZ+bMmcydO7dUqAO466676NevHwsXLjzoPrRv354vvviCo446qlT75MmT6dmzZ7mBz7ZtIpEIHTt2JBKJ8OWXX7Jo0SLmzp1b7rH77Q8HABMnTmT9+vUAXHzxxVxyySUV9jEQCDB16tRYMPvHP/4RC4q/Nm/ePACuueYaoCR0FBYW8tprr8VGjvbX+C1OpxPDMGKPt2/fTnp6euxxNBrlyCOPLLWBTVZWFhkZGbhcLpYuXcqqVav4+OOPGTRoEE8++WSpqadLly5l2rRpADRq1Ignn3yy1Pnbt2/PunXrGDJkSKn2OXPm4Pf7GT58eKztwAAYDodj//viiy+me/furFy5kg8++IAZM2aU2ejFsiw6d+7Mww8/HGvbtWsXjRo14p133in187IsiyuvvJLBgwcDJZva7F+TCBVfaxEROTja/VJEpAZzOBxEIhGOO+44Xn/99dh/nE4nP/zwAzNnzox9OI9EImzevJljjjmmTJ169erxwAMP8NBDDx10X6655hpmzJgRC1jwy7qso446itNOO40333wzttPi/PnzqV+/PocddhhQso7urrvuok2bNmXWB3bq1Invv/+eb7/9NlZ3vylTpsTe928FOiiZRvjSSy8B8MMPP7Bs2TLOPvvsUsdEo1FefvllJk+ezLJly1i2bBnLly9n1KhRzJo1q9yRr/3X4UCdO3eOjfYVFBRw+eWX8+OPP8aeb9++PVu3bmXNmjUAfPPNN/Tq1YusrCwefPBBZs6cyVlnncXEiRNp1aoVmzZtKlW/R48esfd+YKCDkhHN1atXs3Dhwli/169fz7Rp08qE74yMDDZt2kQwGCQcDrNkyZLYcxdffDHffPMNAwYM4K677iI/P5/s7OxS77tz586sXLmSzZs3A7BixQr69etX7kjoqaeeyiuvvEJhYSFQssvpTTfdRHp6Ou3atYuF3A0bNsS1e6qIiJTQ12MiIjXYOeecw9ChQ5k+fXqZD+mPPPIIDzzwAL169cLn82FZFj179iw11e/XTjrpJIYNG8Zjjz1Wqv3ANXUAY8eOLbU2DEqmI959991MmTKF4uJiwuEwhx56KLNmzaJhw4Y0bNiQYcOGcfnll2NZFhkZGTz++OOx2v379+df//pXudMqMzIyePDBBxk3bhwul6vSa8L2u+mmmxg/fjwLFizA4XBw7733lgmQ7733HpZ
l0bdv31Ltw4YNY9asWaxYsaJM3VNOOYVx48Zx11130a5du1j77bffzh133EHfvn2xbZtRo0aVGmnLyMhg2rRp3H///QSDQWzb5v7776d58+ZcfvnlTJgwgfPOOw+3202bNm0499xzK/V+69evz+zZs3nggQdiP2ufz8eUKVPK7HzZtWtXOnXqRO/evcnMzOTkk09m48aNAIwbN4577rmHhx9+GMMwGDNmDM2bNycajfLoo48yZswYZsyYERsdtm0bp9PJv//9b5KTk8v0a9CgQbH1g4Zh0KRJk9gGNv/617+45ZZbmDNnDoceeigtW7as1HsWEZGyDDsRk/FFRERERESkWmj6pYiIiIiISC2mUCciIiIiIlKLKdSJiIiIiIjUYgp1IiIiIiIitZhCnYiIiIiISC2mUCciIiIiIlKL1bj71OXmFmFZ1X+XhQYNUti7t7C6uyG/omtS8+ia1Cy6HjWPrknNo2tS8+ia1Dy6JtXHNA3S08ve//P31LhQZ1l2jQh1QI3ph/xC16Tm0TWpWXQ9ah5dk5pH16Tm0TWpeXRNahdNvxQREREREanFFOpERERERERqsRo3/VJERERERKQybNsmNzebUCgA1IapowZut5f09EwMw4i7mkKdiIiIiIjUaoWFeRiGQePGzTGMmj8Z0bYt9u3bQ2FhHqmp9eOuV/PfsYiIiIiIyG/w+wtJTa1fKwIdgGGYpKam4/cnZpfR2vGuRUREREREKmBZURyO2jUJ0eFwYlnRhNRSqBMRERERkVovEWvT/kiJ7K9CnYiIiIiISIK9/fZihgwZxMUX/5X58+dV6blq1xiliIiIiIhIDZedvZsnn5zJ00/PxuVyc/XVV9ChQ0eOOKJllZxPI3UiIiIiIlInzX/fScerk2kyKIWOVycz//3EjHmtXbuaDh06kpZWD5/PR/fuPVi+fGlCapdHoU5EREREROqc+e87GfeYlx17TGzbYMcek3GPeRMS7PbsyaZBg4axxw0aNGT37t1x162IQp2IiIiIiNQ5U1/04A+V3qzEHzKY+qIn7tqWZZXaCMW2bUyz6jZyUagTEREREZE6Z+fe8kNWRe2V0ahRY/bu3RN7nJOzl4YNM+OuWxGFOhERERERqXOaNbAr1V4ZHTuexKefriE3N5dAIMDy5cs4+eTOcdetiHa/FBERERGROueWwUHGPeYtNQXT57a5ZXAw7tqZmY246qpr+dvfRhEOR+jb93yOOebYuOtWRKFORERERETqnIGnR4AAU1/0sHOvQbMGJYGupD1+Z599DmeffU5Cav0ehToREREREamTBp4eSViIq05aUyciIiIiIlKLKdSJiIiIiIjUYgp1IiIiIiIitZhCnYiIiIiISC2mUCciIiIiIlKLKdSJiIiIiIjUYgp1IiIiIiIiCVZUVMjQoReya9dPVX4uhToREREREamTPBvnkfFsOxpOq0fGs+3wbJyXkLobNqzn2muvZPv2bQmp93sU6kREREREpM7xbJxH6tLrcBRsx8DGUbCd1KXXJSTYvfHGa4wdezMNG2YmoKe/z/mHnEVERERERKQGSf5oMkbEX6rNiPhJ/mgywTYXxlV7woR/xPX6ytJInYiIiIiI1DlmwY5KtddkCnUiIiIiIlLnWKnNK9VekynUiYiIiIhInVPUZRK201eqzXb6KOoyqZp6dPAU6kREpMo5HAZej4nb7ajuroiIiAAQbHMhBT2mE01tgY1BNLUFBT2mx72erjpooxQREalSKV4/nuh3GD8/he1rh93wUvYVp2FZdnV3TURE6rhgmwurNMS98sobVVb71xTqRESkyrhcJp7idzG/vxQAAyD7GdLaLmVfUWq19k1EROTPQtMvRUSkyvic+zB3Tind6P8WM/IzhlE9fRIREfmzUagTEZGEcThMTPPXac0Ao5xJIYbW1omIiCSKQp2IiMTNadjUD+TjXb6UlK3fU9+fR3KoiJCdgdXi7tIHp5xI1GyErSV1IiIiCaE
1dSIiEhfTNEjO2U3+5SNx9+qBGQySf80NACTdcSvhjp1wHv8Fxt6XwNcOK60HBcWpgFKdiIhIIijUiYhIXJLDxRj5+SRdNRxnu6PJOblb7Ln8PgNIX/sh+S1bY6bfjmXZRAqjKNCJiIgkjqZfiojIQXObYC9bTk7H0wi9sxT/cy+UOSbw9HOk2mHC4QiRSLQaeikiIvLHe+aZJxgy5EKGDLmQmTMfqdJzKdSJiMhB8xYXUHD9TQBYe/biaN60zDFm06YEZj6BLxr6o7snIiJSLdas+YQ1az7m2Wf/w3PPvcjGjd+yYsV7VXa+uELd66+/zrnnnsu5557LfffdB8A333zDgAED6NWrFxMnTiQSiSSkoyIiUvMYBti5uQCEli7HdcZpONocFXvecVRrvJcMouju+3CFFerktzncBiFvADxRjF/d88LpMrA9UZwufRctIon1ZsFSem8bwgk/9KL3tiG8WbA0IXUbNGjI6NE34nK5cDqdHHbY4WRl/ZyQ2uU56L+Ofr+fKVOmMHv2bF5//XXWrl3LRx99xPjx47n99ttZsmQJtm0zb968RPZXRERqkLDLg+fiC0oeWBb5Q64k7cnppC97k3qvvUTq9AcJr16Lp9+5WNXbVanhrJQQc4oWMGLreO7a/QjFyfk4nSZWSoiXixcx/ue7WRhYgpWiLwdEJDHeLFjKnXsfZld0NzY2u6K7uXPvwwkJdi1bHsmxxx4HwPbt21i27F06d+4ad92KHPRGKdFoFMuy8Pv9JCUlEYlEcDqdBAIB2rdvD8CAAQOYNm0agwcPTliHRUSk5vA73NS/724czZoSWv4Bro4dwOUi//qbsHfuwtq9G+/gC0m++3YKk9MgomgnJVwuB7ZpYUcMbGeUJ7Pn8OjPJWsyvyreyKrCz3i97RNM3PYgb+/7EIB3963kswYbmNhoDEZAe72JSHym5z5LwA6WagvYQabnPkuf1B4JOceWLZu56aYbGD36elq0ODQhNctz0H8RU1JSuP766+nduzc+n49OnTrhcrnIzMyMHZOZmUlWVlal6jZokHKwXUq4zMzU6u6CHEDXpObRNalZquN6WAUGRvOm+EZcjuPoo9jXsx92YWHseXfPM3Eefijpbvcf3reaQP9GysoO7+XF7LdYX/wdF2WeSxtvS+btebPUMT+FdpNnFbA875NS7a/tXcItLa7mkNT0gz6/rknNo2tS89S2a7J7t4nTWblJiD9Hsytsr2yt8nzxxTpuuWU8N944jp49e5V7jGmaCflZH3So+/bbb5k/fz7vvfceqampjBs3jpUrV5aaA2/bdqnH/4u9ewuxrOrf6jozM5Xs7ILq7ob8iq5JzaNrUrNU1/UwDEgb+Ff8t9+F2bgRnkH9CTz/Itg2nkF/xdGrJ3vygkDwd2v92ejfSFmR5ABDNt/It/7NALye8w73HDaOvzY4myey5pQ61md6y7zebbqwovZB/1x1TWoeXZOapzZeE8uyiFRyNsghjkx2RXeX217ZWgfKyvqZm28ey+TJUznxxE4V1rMsq9TP2jSNgxrkOuhQ9+GHH9K5c2caNGgAlEy1fPrpp8nO/iXx7tmzh0aNGh3sKUREpBawbch3J+O55y4I+PF1OpGkO27DtmwiHg/5Lp9uSycx+XZBLNDtN3PXC/znqId4bvd8QnYYgMszLyCZJOa1ncEXRd8we/drfB/YypRD/w4YhJL9uKNujKADW79fInIQrksfzp17Hy41BdNreLgufXjctV966QWCwRDTpz8Ua+vffwD9+18Qd+3yHHSoa9u2LQ888ADFxcX4fD6WLVvGSSedxJIlS/j000858cQTef311zn99NMT2V8REamBbBsChouAz1XS4Pr1k9XSJamhHDjKPB55yMV4TA9L2j2Pw3BgWJDk9DFj1yzeyF3KoZ6mPNzyH2SY9cmzCrhi83h+CGznnPQzuKXZNTgLPQp2IlJp+9fNTc99lp+j2RziyOS69OEJWU93ww3juOG
GcXHX+V8ddKg79dRT+frrrxkwYAAul4vjjjuOkSNH0rNnT2677TYKCwtp164dl112WSL7KyIiIrVYipFMl9QOfFTwGQA3Nx9FdjiHzl8OxMamqbsRL7d5lBd2L+Cp3XMByArvYfDGG1h67Atc+M0YCqJFALyy901choObG46GoG53ICKV1ye1R8I2RalOcW0dNXLkSEaOHFmqrW3btrzyyitxdUpERET+fKK+EHutXKYcNo6N/h/YHPiRXumn0e2rS2LH/BTazV3bp9Mx5bhSrw3bEbLCe2OBbr+3933IjY1H4CbpD3kPIiI1kfYDFhERkSpneyLMzp3Pw7ueBeDYpKOY2Gw0Pwf3lDl2o38LFzTsU6otaIVo4KyPgYH9qzm9rXyHYdr6OCMidZvmKoiIiEiVC7tCPPrz7Njj9cXfMWrzRI7wNcdjlL7dRY/6XWnpbUE9R8k23w2c6UxvOYlUkrmt+RjM///4kuGsz9RDb8IT8vxxb0REpAbSV1siIiJS5SwswnakVFuhVYwbNy+1mcaErffxUzCLPhnd6ZV+Ondum84zre8jzZGKaRi8lL2IncEs/trgbP7aoBe50TySTC9hK0rQG8BVrM1SRKTuUqgTERGRKueKujg97STez18da7v3sJvYGtrJf3OXMbH5aI70Hsr64u8Y9t14iqxitoV2cs9h4xi88UYsSu7x9Mzul3njmCe54rvxbA39BEDn1BOYcfidOAo1YicidZNCnYiIiFQ5Z8DLw4ffzlfF35Br5ZNmpHKYrylnrR+Kjc3TWfMwMJjXdjo+00ORVcy56WfydNbLsUAH8HM4m48L1uFz+GJtqwo+Z4P/O05w/IVoNL4bBouI1EZaUyciIiJVzjQhaAd5a98KXsp+g1RnEnOyF5Xa9MTG5s2c5ZxarxMAXtNdbq2oHS3TtiWwHdM0qqbzIiIH4amnHmPIkEEMGXIhc+a8UKXnUqgTERGRKhfwFdP/21HM2bOITwrW8e+f/0OmK73Mcc08h3BDk+E81/oButfrzJgml8U2RgE4xJVJp9Tj+da/OdZmYnJm/c5EImXDnojIb7Hmv4rVoRNW46Yl/z3/1YTU/fzzT/n00zU899xLPP30LObPn8e2bT8mpHZ5NP1SREREqpRhlEybzAr/cvuC9/NXM775SJrtfo2doZ8BaOpuTLe0k+j99TDchpu8aAFDMvvz7rGzmZP9BpmuBvRr0AOP4eG+w27miaw5JDt8TGw2mtRwmjZKEZFKsea/CmPHgd9f0rBjB4wdhwWYAwfEVfuEE05k+vTHcTqd7NmTTTQaxev1/f4LD5JCnYiIiFQp24YUR3Kptogd5fFdL/H60Y/zdfH3AByddCTXbr4dvxXETxCAF7IXMLLRxQxvNIh7dszk/p2PY9vwVKupvNTqEQzLxBdOIhLWWjoRqaQpU38JdPv5/SXtcYY6AKfTydNPP85LL82me/ezyMxsFHfNimj6pYiIiFS5NFI4u/7psccpZhITmo/i3X0reTN3OVuDOzExaes9MnaM23DhNT14HR6u3DyBN3KXErYjRIhw5eYJ2Aa4/F4iEQU6ETkIO3dWrv0gjBgxikWL3mX37iwWLnwtYXUPpJE6ERERqXJOv4d7mo9n9CFD2Bb8ic6pHXj051k8u/uV2DELct7msSPvpoW3CaektccfDdDC0xQDg83+baXqRewofsuPh6Q/+q2IyJ9Fs2YlUy7La4/T1q0/EgoFad26DV6vl9NP787mzZvirlsRjdSJiIhIlbNtcBZ5OCJ8BN2dp2LbNi9mLyx1zKeF6wlbEVp7D2fQN2O4aOPf6P7VYHKj+fSs37XUsQ2c6SSbCnQiEoeJt4DvgHVuPl9Je5x++mkH9903hVAoRDgc5sMPV3D88e3jrlsRjdSJiIjIH8aybCzLBhuSHD6CkVDsOQMD0zAZ++PdBO2S9qAd4prv/8HcttNwGE6W5q3kKN8R3H/YLXgCPjTxUkQOljlwQMnfkCl
TS6ZcNmsGE2+Je5MUgM6dT+XrrzdwxRWXYpom3bqdyVln9Yq7bkUU6kREROQP5w37uK35aP7+4z2xtqGZf8XEIDeSX+rYzcGtWJbFpMY3cmvT0ZiWiSfo043GRSRu5sABCdkUpTwjRoxixIhRVVL7QAp1IiIi8oeLhuAMX1eWtvsPH+V/ynFJbWnubIIdtTnG14qv/d/Hjj0uqQ2m5cQIOHBTMlUqqjE6EZEYrakTERGRauEIuGjoz2SA71xaRo7AWezBG/TxdKv7OaPeKaQ6kjmzXheePHIq7oC3ursrIlJjaaROREREqo1tQzgcjT2ORm2Si1P5Z9PbiDoiOKNOnMWeknV4IiJSLoU6ERERqVEsy8YRcOPAXfIYBToRkd+i6ZciIiIiIiK1mEKdiIiIiIhILaZQJyIiIiIiUgVmzHiYKVPuqPLzKNSJiIiIiIgk2Nq1q1m8eNEfci5tlCIiIiIiInXSlneDfPZUgKLdNsmNDDpc6aXlWZ646+bn5/HEEzMZOnQ433+/KQE9/W0aqRMRERERkTpny7tBPnrQT1GWDTYUZdl89KCfLe8G4659//33MHLktaSmpiWgp79PoU5EREREROqcz54KED0gv0WDJe3xeOONBTRu3JiOHU+Kq05laPqliIiIiIjUOUW7y78HZkXt/6ulS99m7949DBs2mPz8PPx+P9Om/ZO//e3vcdX9LQp1IiIiIiJS5yQ3MkqmXpbTHo+HH54Z+99vvvkGn3/+aZUGOtD0SxERERERqYM6XOnFccCeKA5PSXtto5E6ERERERGpc/bvclkVu1/u16dPX/r06ZuwehVRqBMRERERkTqp5VmehIa46qLplyIiIiIiIrWYQp2IiIiIiEgtplAnIiIiIiJSiynUiYiIiIiI1GIKdSIiIiIiIrWYQp2IiIiIiEgtplsaiIiIiIiIJNh1140iNzcXp7Mkco0ffyvt2h1bJedSqBMRERERkTpp50fz2PjyZPx7d+Br0Jw2gybRrMuFcde1bZvt27fxyitvxEJdVVKoExERERGROmfnR/P46pnriIb8APj3buerZ64DiDvYbdu2FYCxY8eQl5dHv379GTjwovg6/Bu0pk5EREREROqbmCMKAAAgAElEQVScjS9PjgW6/aIhPxtfnhx37YKCfE48sRP33PMgjzzybxYsmM+aNR/HXbciGqkTEREREZE6x793R6XaK+PYY4/n2GOPjz0+77zzWbVqJZ06nRJ37fJopE5EREREROocX4PmlWqvjC++WMfatatjj23brtK1dQp1IiIiIiJS57QZNAmH21eqzeH20WbQpLhrFxYWMHPmIwSDQYqLi3jrrf9y+und465bEU2/FBERERGROmf/ZihVsftl166n8fXX67niikuJRi0GDBhUajpmoinUiYiIiIhIndSsy4UJCXHlueqqa7jqqmuqpPaBNP1SRERERESkFlOoExERERERqcUU6kRERERERGoxhToREREREZFaTKFORERERESkFlOoExERERERqcV0SwMREREREZEE+/DD93n22ScJBPx06nQKN9wwrsrOpZE6ERERERGRBNq5cwcPPjiVqVMf5Pnn5/Ddd9+yatXKKjufRupERERERKROMlfl4Hz1J9gbhgYuIgOaYnXOiLvu++8vp0ePnjRq1BiAO++citvtjrtuRRTqRERERESkzjFX5eB8fhtGyC5p2BvG+fw2IhB3sNu5cztOp4ubb76RrKwsunQ5lauuuib+TldA0y9FRERERKTOcb760y+B7v8ZIbtk5C5O0WiUtWtXM2HC7Tz++DN8/fV63nprUdx1K6JQJyIiIiIidc/ecOXaKyEjowEdO55Eeno6Ho+X00/vztdfb4i7bkUU6kREREREpO5p4KpceyV06XIaq1evoqCggGg0yscff0Tbtm3jrlsRrakTEREREZE6JzKgaek1dYDtNogMaBp37XbtjmXw4Mu49toRRCIROnU6mT59+sVdtyI
KdSIiIgnmdJp47CIIOTBNA8uyf/9FIiLyh7I6ZxCBKtn9EuC8887nvPPOT0it36NQJyIikkD1vFGc+77BfP9ucKeS3v0OCpyHEIpUd89ERORAVucMQgkKcdVJa+pEREQSwOkwyHDl41r3JOaGeXDqzWBbmI91JM3OJs1ZVN1dFBGRPymN1ImIiCRAmpmH4/GOUPhzScPH0+Dyt2HrBxib38UV9pN09FCKI/EvwBcREfk1jdSJiIjEyek0MX5Y/kugA4iGYc3jcMxAyDgSc/tHeCiutj6KiMifl0KdiIhIItjlLJqzo1DvULAi0OUGDLRhioiIJJ5CnYiISJwiEQujZXdIavBLo+mAruOh3QBYeDX2nk2EzJTq66SIiPxpaU2diIhIAoQND+7L34av5kAwH/4yFNa/DG37QsFPkHrI/6+n02idiIgklkKdiIhIAhRbKTgLszFzfwCXD176KzQ5ARodAw1aEW14HFZYgU5EpC54440FzJ8/L/Z4166d9OrVh7Fjb66S8ynUiYiIJEA4YlHc8ER8PQ7H3LQYjr8UI7UJ0b0/YA99h7xoKhqlExGpWX76aTubNn1DIODH6/XRuvXRNG3aIu66ffv2p2/f/gBs2bKZW28dxxVXjIq7bkUU6kRERBLEH/UQMJriPOYqLMsiIyOFfM+RRMIWCnQiIjXLTz9tZ8OGL7CsKACBgJ8NG74ASEiw2++f/7yXkSNHU79+/YTVPJA2ShEREUkg24ZwOEo0WhLiIhGrmnskIiLl2bTpm1ig28+yomza9E3CzrFmzScEg0HOPPOshNUsj0KdiIiIiIjUOYGAv1LtB+P111/loosGJ6xeRRTqRERERESkzvF6fZVqr6xwOMy6dZ9x6qndElLvtyjUiYiIJJhpGtRzFkDOZjKcuSQ7A9XdJREROUDr1kdjmo5SbabpoHXroxNSf/PmTbRocSg+X2JC4m/RRikiIiIJZBhQ35GHY3Yv2L0Bh2Hg7TQa89SJFESSq7t7IiLy//ZvhlIVu18C7Ny5k0aNGiWk1u9RqBMREUkgjyOK8eF9sHtDSYNtY66egbvDcAzPUdi2dsEUEakpmjZtkdCdLn+tR4+e9OjRs0pqH0jTL0VERBLIaQcwd31a9ondG3A4jD++QyIi8qenUCciIpJAITMF6+iBpRsNA7tFZ93eQEREqoRCnYiISAKFwjbR4wZjnzwGXD6o1wLrovn4zaq76ayIiNRtWlMnIiKSYPtCyfi6Tial20RC4Sh+ox6hiNbSiYhI1dBInYiISBXwR1yQegh50TQFOhERqVIKdSIiIiIiIrWYpl+KiIiIiIgk2JIlbzJ79nMAnHJKF8aMuaHKzqWROhERERERkQQKBAI8/PCDzJjxBM899yJffPE5a9Z8UmXnU6gTEREREZE6yZM/j4wt7Wj4XT0ytrTDkz8vIXUtK4ptWwQCfqLRCNFoBI/Hk5Da5dH0SxERERERqXM8+fNIzboOw/YD4IhsJzXrOgCCaRfGVTspKZkrr7yawYMvwOv10r59B4477i9x97kiGqkTEREREZE6J3nP5Fig28+w/STvmRx37e+/38R//7uQ+fPfYMGCtzBNk5demh133Yoo1ImIiIiISJ1jRnZUqr0yVq9exYknnkR6egZut5s+ffry+eefxl23Igp1IiIiIiJS51jO5pVqr4xWrY5i7drV+P1+bNtm5cr3adv2mLjrVkRr6kREREREpM4pajip1Jo6ANvwUdRwUty1TzrpFL777ltGjBiC0+nk6KPbMWTIsLjrViSuULds2TJmzJiB3++na9eu3HbbbXz00UdMnTqVYDBI7969ufHGGxPVVxERERERkYTYvxlK8p7JmJEdWM7mFDWcFPcmKfsNGTKsSoPcrx10qNu+fTuTJk3i5ZdfpkGDBlx++eWsWLGCSZMmMXv2bJo0acKoUaNYsWIF3bp1S2SfRURERERE4hZMuzBhIa46HfSaunfeeYc+ffpwyCGH4HK
5eOihh/D5fBx22GG0aNECp9NJ3759Wbx4cSL7KyIiIiIiIr9y0CN1W7duxeVycfXVV7Nr1y7OOOMMWrduTWZmZuyYRo0akZWVlZCOioiIiIiISFkHHeqi0Shr165l9uzZJCUlcc011+D1ejEMI3aMbdulHv8vGjRIOdguJVxmZmp1d0EOoGtS8+ia1Cy6HjWPrknNo2tS8+ia1Dy17Zrs3m3idNa+jf1N00zIz/qgQ13Dhg3p3LkzGRkZAJx11lksXrwYh8MROyY7O5tGjRpVqu7evYVYln2w3UqYzMxUsrMLqrsb8iu6JjWPrknNoutR/VJcYbwUghXGNt2Y9VvomtQw+ndS8+ia1Dy18ZpYlkUkYlV3NyrNsqxSP2vTNA5qkOug42z37t358MMPyc/PJxqN8sEHH3DOOefwww8/sHXrVqLRKIsWLeL0008/2FOIiIjUGmmuYrw/vonx6PEYD7XCnHsB7NuKw1H7vjkWEZHa5aBH6v7yl79w5ZVXMnjwYMLhMF27duWSSy6hZcuWXHfddQSDQbp168Y555yTyP6KiIjUSG7bj/HaMLCiJQ07VsPi8aT0nk5eNKla+yYiIn9ucd2n7oILLuCCCy4o1da5c2cWLlwYV6dERERqE9M0YN+PvwS6/bavwhn1Awp1IiJ1zezZz/Hmmwtxu92ceWZPLr98RJWdS3NCRERE4mRZNtQ/HExH6ScO7UrEoUAnIlJTzQ846ZCbTOO9KXTITWZ+IK4xr5g1az7h3XcX89RTs3jmmf/w9dfrWbFiWUJql0ehTkREJAGCJGFf8AL40ksaWnSGcx6gIKJQJyJSE80POBlb5GWHZWJjsMMyGVvkTUiw27RpIyed1Jnk5BQcDgcnn9yF999fHn+nK6BQJyIikgAFER/Fh/bGunYd9tgtRC+cB/Va1IgdnUVEpKwpfg9+St9+zY/BFL8n7tpHHdWW1atXkZ+fRzAY5MMP3ycnZ2/cdSuSmPFFERGROs7rdWAX51BsGGB6CJn1yKjuTv0JGUbJfZ2i0dq3dbmI1Cw7rfLvp11Re2V07HgSvXufx3XXjSI1NY2OHU/i66/Xx123Igp1IiIicfI6whRuWcvKh66gMOtHmnboSZcbngKSq7trfypRX4g8I58tgW20S2mNN+TDCOmjjIgcnGamzY5yAlwzM/4ZFsXFRZxxRg8uvngIAC++OIumTZvHXbcimn4pIiISJyNSyNJJ51GY9SMAP332DmueGIs/N6t6O/YnYnnDPJf7MmduGMyVmyfQ9atBfBn9Gocr/m/URaRumugL4qN0gPNhM9EXjLv2Tz/9xIQJfycSiVBYWMiiRa9z5plnxV23Ivp6S0REJE6BvN1EgsWl2natW0Y05AdTG6UkQtgZ4tFds2OPLSxu2fYAr7d+AlfYV409E5HaaqA3AgSY4vew0zJoZpYEupL2+LRq1ZozzjiTYcMuIRqNctFFgzn++Pbxd7oCCnUiIiJx8qZlYpgO7F/dpy7jyPYYDidon5SECNlhLEqvo9sTztGcIxGJy0BvJCEhrjzDhl3JsGFXVkntA+lPoYiISJxsh5vO1z2Gw1MyYpRyyBF0HvNvkhtW3fqJusZre2jlPaxU24AG5+CKuKqpRyIiNYdG6kREROIUsFNo2qkvf+3Qk2gogMPtw/Y2qO5u/al4g0n8p/XDPLTrGb4q3sjZ9U5laMMBGEVODYaKSJ2nUCci8idgGGDl5FKvqAAMg7DHR7Ej/vvsyP/ObyWBkQT7f+xBi9Rq7dGfSzRq4y1K5paGowkaIXyWl2ihZreKiICmX4qIVDufz4XPF98UsrRQMXlXXEPuEceQ27IdkTvuJDVc/PsvFKnhfNEg9YvzqLd3F2mBAlKKQ9QvsCGqjzAiUppt166veRLZX/1FFBGpJl6vk4xgPo4FCzDnziXDn8fBZDuXy0H4jf8Sen1RSYNlEXj8GdjwNQ6H/sxL7eWLBOD5WeQc2Y7cozuQd8oZmD/+gDX
9UVL3/kxqqAinU7/jIgJOp5uiovxaE+xs26aoKB+n052Qepp+KSJSTZL27SWn61lY23cAUJiRQcba9/GnVG4tliMaIbxsRZn28AcrcZx0MtFoOS8SqeEMw8AbCmB1OQXXf+fjf+p5gq+8RtGkKaTceycFV42BUAjfbTcRPaEDfqe3urssItUoPT2T3NxsCgv3VXdX/mdOp5v09MzE1EpIFRERqZSUFA+BOYtigQ7AzsmheMZjpN45iYLi/3175YjpxNPvXILz5pdqd53dg6KIVcGrRGouw4B6wQIKrhpN6J1lGBkZpNw7GbPJIQRffZ3INxsJr/gAgPC5A6m/6j2CrY/GsmrHN/QikngOh5OGDZtUdzeqjeYsiIhUA8MAOye3TLudsw/DqlwQi0SiOM7shu+6q8HtxkhOJvnu27FbHqkPuVIreewIxXffR+idZZhNm+D9a1/C772Pd8hFuPv2wWx6COnLF1Nv7ixcnU8m8NhTuBxGdXdbRKTaaKRORKQaFBUFqXfphRTd8wCEQiWNhkHSmKspilb+w2m+M4kG99yBd/yNAAS9yRTiSGSXRf4wTn8xRctW4Lv6SryD/or/hTkYmZmY9euTfMdE8gcPJ7xsOY4jW5I6459EsrKJGAp1IlJ3KdSJiFQDy4JIWn0y1n5QEuwCQZJuugGrySGEwwe3CM5MSSHPr5E5qf0iXh+e88/FfVZ3cnv2LfkHAwRmv0T68rdiUy+jm7eQd+kIMj7/iOKQFo+KSN2lUCciUk0KDDeuFkeQNO2fYNsEvEkEg/pgKhI0XdS/cQyF142LBToAOzeX0NLluDqdSPjj1SVtOTlYxX7w1auu7oqIVDuFOhGRahQOR8nDDQagQCcCgG1DwO3D8JXd0dLwebHD4V8eJydDctIf2T0RkRpHG6WIiNRwyWE/9Qr2Ui83i9RQEaZZsnbIY1jUCxRQL5BPkhWq5l6KJFYQB75bxoHHE2szmzXF3asnVvYeAIy0NFLnzsLvSa6uboqI1AgaqRMRqcFSw8UErh9HwSuvAeA8/ljS3nyNkOGEhQvZN+F27MJCvIMvJPrPewDdq0v+HCzLxt/wENK//pTQS/MwvF5cXU+hcOIdpM18CMeRLbGSk/F7kwlpUyARqeMU6kREapCUcDGOgnzs4mKMRo2wd+wg+P+BDiDy5XoCL7yE59ze5F5zfaw9MPslnO2PxzX8CsK6N538SYQMB6HUBrjHjMEXCmD/+CPuQQMxjzuWQl8KYYU5ERFA0y9FRGqM1FARwev/Tm7bE9jXoSt5nbvjSEvFqF+/1HF29l7CH31c5vXBRYtxBor/qO6K/GFCoSh5uCg44igC3bqzz1dPgU5E5FcU6kREagDTNGDbNoKvLIi1Wdt3UPTwo3gHX1jqWOdpXXB1PhmcpSdbuE/vStSj6Zfy52XbNtGoRqJFRA6k6ZciItXEMCA5VIzDX4zhchLe9H2ZY6LffEfa84/jHXoJkW++xdHqSMyGDYj+sJWML1ZRfP/DBJ7/D64zTsN3zZXk/M4GmsmEcRcXQTSC7XJTmFKfiKZrioiI1GoaqRMRqSapoSICI0eT2+o4ctqcgOuYtuAoPaXMe9EACkaPJbdzdwIvzMHRIIOc9p3JO3cAOe274B16CQ12bCLtmcew9+0jLVQY2x2zzPnCxTD/VXLadWTvkcdRMGgIqQU5mPp/AhERkVpN/1cuIlINHA4Ta/VaQv9dXNIQCFD86BPUX7KApNsnkPLIA6S9PBuzSRNCi98BILxsBf5nZ+MdcH7Ja8JhCq65gcgXX7K3ZTv2tu1A4Tn9SQsUlHtOp7+IgmtvxC4qKnn5J2souu1OkqPBKn+/IiIiUnUU6kREqoHDYRD5fF2pttA7y3A0aYJd7Cc471Wi332PWb9eyc2V/1947ec42rSOPY7u/AnjV6N7kfUbCM5+CZer9J93p9Mk8u2mkrs6/0r44zWYRdpcReo2w4Bkl4s004XXo5UpIlL7KNSJiFQ
Bp9OB213x7nyRiIW733ml2lKmTCL/imvwP/gI4ZWrKJo4Gf8zs0i6YXTsGPeZpxP5akPssfeSQYTeWVa69rovcVil18lFIhbOo9uUfHr9FddpXbBSUir9/kTi4XSahJL8hJL8ONzV35f0qAvjhW1EHtyIZ9U+0k1X9XZKRKSSFOpERBLI5XKQEcgn+avP8a5aSUYgD4+nbLizLJtI02akPvcEZvNmGJmZODu0J/zJmlLHBebOx92zO5gmngsH4B30V5Kn3E795YvJ+Pwjkm+fQGjV6lKv8QwbQricP+8Rr4+055/ASE8HwH3mGaRMnkiRPsDKH8jyhFljf8Ylm/9G/01X8XLRIqykULX1JzXioHjCF4SX/Ex0fR6BGZuILt1Nilv/LkSk9tAcAxGRBEot2se+foOIrPsSALPJIWSsWkYwKb3MsQG3j7Q+Z1PvmDZQXIzh85ZslBL9ZQtLo149zGbNSH/vLUIrPqTg1skkjbqC/MtHYmXtxnFkS+rN/w/+VxbgPOIwHG1aYx91VLnbvhe4kkjq04eM008F28ZyOslLqoelLeLlD2IYkOfIZ/i3N8Xa7twxjeaeQ+jiPOkP34nVMIDcMPae0utKQ2//TFK3TH31LSK1hv5ciYgkiM/nIrTiw1igA7B2/UzRIzNJ85Q9PjlQSEHfQeSe1I3cM3rjf2Y2SWPHlDom9cEpFN46idxuvSi6/S6Srr2KvMFXYGXtBiC6eQuRrzbgatsa/6NPUDTxTozNW/DYkXL7WGyZ5CbVJzc5nTxPqu75JX8op9PBsryPyrS/svctoo7yf2erkm2D4Sn7UchIdZaZqiy/zeEwCdpJZBUkUxxNJkI1z6sVqWMU6kREEsThMLG2bS/Tbm3bgREKl2k3A/5S0y2L738IR8sjyPj0Q1L+OZX0FUtwnngCwTfeih1juFzYOTmxx84O7cG2yR9yJZHP1hFe8QH7TuuJt2Bfgt+dSPwsy6aNr2WZ9mN8rTHtitegViUrycTZKeOXBoeB57IjCCqT/E9M0yBk+9hblMSuvQ4Wf2LS7jKTJxa5iRi+6u6eSJ2hUCcikiB+fxDPgH7gLD2z3TdyOEFPOR9uDrgnHdEoRZPuwcrLJ/Cfuew77wL8jz1N8o2/jN7ZwSBGw4axx+6zuhOY92rpOpEI4cXv4nJVz4dkkYpEoxZtPa3oUa9LrK2193CGZPYnGrJ/45VVJ48I7mtbkTTleDzXtCL50ROxD/VSFC77RYyUFbSSuOB2J22HGJxyNXy5GZ66Ge58ziA7z0HESKruLorUCVpTJyKSINEoRFPrkf7eWxTefjcUF+MbOwaz3dEEAmWnloXdXrzDhhBa/A7eywZjNm6E49hjCLy+iMhn68DlIvzRxyRdfy3pH79HaPmHGPXSqP/aS+QNHo61fQc4nTgOP7RMbfPwQ7Gs6vmQLPJbXMUe7m9+C8Ut/ITtMGlGKu4iHxbV8/tq25BLGNehbpwtvRSFokR/ta5VKuZwOnlsIaz+5pe2/7wDw8+FWbfB7hyDBe+bXHu+CzuqkCxSlRTqREQSKN/pw/uX9qTOehIjGiWU2Yh9ReXv7Ffs8pF2790k3zyWwkl3E3rvfbyXX0rSyCsgey++60YRWfs5ZqNMzMaNCX/8Cf6ZT+A47FBSH/0Xzg5/AQuIRgi+sgDr5ywAnCd3wmz/F62XkxrJtsFR7CGVXxaaVleg+7VwOEo4rDBXGeGoyScbyk76+uALyN4H1w2Ee/9jMOwcJz6nQp1IVVKoExFJsEAgQsCTWvKggkC3nxEOkXPa2dh79gBQuOZTUu6/m6SJ48k5oQuESl7v7tML70UDyV+wCGvbdiJn98Daup3Cf9yFo/WR1F+8AGtvDqSmQtOmFHhSytxoXEQkkdyOKAPPsHnrk9KbynQ9Di6ZDKvWwx1X2Lic+oJJpKppTZ2ISDUxDANr67ZYoNvP/+RzRDd+Fwt0AKE3l+Bo3gwjORkjIwN3l5MpuO7v2Pv2EVn
zKTntOxPdto1A66PIdydjK9CJSBWLRKKc1RHGDIQkLzRKh0euh4/Ww948+HwTHHM4uM3quw+hSF2hkToRkWpi2zZm/fpl2s3GjUpG3Q5gFRRAUhLOo1oRXr22zPPBF+bi6tUbnOXcP0FEpAoYWPjcDj57GjbtgOfehPkrSp477BBwOyyt7xX5A2ikTkSkGkXT0/EMPP+XBq+XlIfuA0/pYGY2aoSrQ3uSxv0Nz4ihuLqfXqaW6/SuRD1lA51hQHK4mHqFOdTbt5vUUBGmqXtwiUj8vI4gg86w+NdcKA7AO///fVNGGrx4u0WSK/jbBUQkIQy7hs3R2bu3sEZ8o5OZmUp2dkF1d0N+Rdek5tE1SYzUUDHGrp+IbtuBq+MJFPtS8ERCRN9aQvDZ2ThaH4nvtgkU1m+AaZZ8F+cpLiD8r0fwP/woRKO4Tj+VevNmkWOWvXVCWrgI//XjCL6yAABnxw6kvT6Pfa7kP/R91kV16d+Iw2FS7PNSbBh4AU8ohBmseZtj1KVr8kdxOk2KQh4cDpNwFAJB8HlsfM4gtvX7m8/omtQ8uibVxzQNGjRIqfTrFOoqoF/mmkfXpObRNUkcwzAwTaPUjpVOp4m7uAjL7SZoOMusk/NFg3hCAexIhKjHh69Z4zLXw+Ew8a1eRX7v/qXak6dOJjLqasIRbWBQlerKv5Fokge/y8XPEYNFRTAtBx5qZNPXE8EsClR390qpK9ekNtE1qXl0TarPwYY6Tb8UEakBbNsucwuCSMSi2O0jgKPcjU/8Dg/7fPXIS21Aobv8G/yaplFyz7sDRFZ/ikP3jZIECKcmcXOei1ZbDE7fBnuj8PghcHWWQbbpJJyWhKHZviIiVUqhTkTkTywSsXD36VWm3X3xICIOVzX0SP5MXC4HywIGs/INLCACPL6v5LnWbsiKGtybY2J79LsmIlKVFOpERP7EbNsm0vgQUp9/ArNZU4z0dJLumIh5WlcimnopcbJNk7f+j737DrOjLPQ4/n2nnb6bzWY3CYEACZAQCIQO0pUqPV6v6LWjglcBsQAXuaAoUhVBLooF7CggiCLVQq+GACGEnhASUpZNtp0yc6bcPzbZsOyGkmyyu2d/n+fhIeedOXPeZE6Z37yt1PdS4tEKzEjBFi4sCg0VR6FORGRD0pIGIiJDmG2D49j4/jtPNrA2JSeNe+TRFA48AAP4mTydifrDyQCIYj6YS/h9R+/3074ZaLDg3BY4fQyYWDcQREQ2JLXUiYgMQZYFDZUOCq8tIPX4Y4yutFPI2ut8vGqU0Jku0JEu4CvQyQAJw4gDMvCperABz8DXG2CKB+e2wu86IW8ScoEWnxYR2ZDUUiciMgTVlzvo/OKpBH+7AwDT2Mjoh/5Batwm69VqJzLQTBiyR9rljNHQEcONnbDzgu7xdS5QZxJ19RUR2cDUUiciMsR4nk382uKeQAeQtLbSdfa3yZY0xbQMLemKzyGZmHPfgHICv+voDnQpAz8fl5Dy1UonIrKhqaVORGSIcV2H6NVX+5THCxeB70O2/+ULRAZDFCU0lEpc1pDC2DaPTDRU4oSMgZQfYIbgAuQiIrVGoU5EZIgpFn1G77UHpNNQWbNwc+rjHyEe2wydukiWoSWKErxVi4y7QGZwqyMiMuKo+6WIyBAUZ/OMvu8u3AP2w952KvlLzic18xjaFehERETkLdRSJyIyBLXhkttue+p+/VMIQ5LmJlaWNNmEiIiI9KVQJyIyRBWLPsV0ffcDBToRWUeWBVoqUKS2KdSJiIjIO0qlui8ZfD8c5JrImyWWix+6AHhOTBDa+AGkvZi0l1AKHBYvg03GgOeEOEnlHY4oIsORQp2IiIisletaFDrb8G+5G6KY3AcPoavQQFBV089gC8lw6XU2P77F8PXjobHO4syrDUEVtppg8ZcLE4443TB/Cdg2/PwMh8N2TWFif7CrLiIDTKFORERE1qrQsZIVu+9HvGw5AF2NjYyedT9BtmGQazayOY7FvU9
a/PAGg2PDB/eEfb5kSJLu7S8thtN/bPjkYfDtayGK4JQfGp7+pUvGUqgTqWhRYiwAACAASURBVDWa/VJERET6lc26lH97XU+gA0haWyn/7Jdks94g1kwwNjff330ZV5+HJa30BLrVZr8IW01Y87izBFX1nhWpSQp1IiIishaGpK29T2nS1o5lmUGoj6xmiDlwp+4Ut6IDxjeC5/be56BdYNYLax5vPwk85y3JT0RqgkKdiIgMONe1yGRcXfgPc5VKQOaET4H7prRg22S++DlKpWDwKiZUqxFH7BVz9N4JSQJX3gTXfzth4tju2S6P3ifhvBMSVnQkNDfAEXsl3Hx+QiGliVJEapFJkrc21g+u1tYu4njwq9TUVKClpXOwqyFvonMy9OicDC1D4XxYFtSXOwmfnkM4+ylSRx5O1NREKZcnNFW8OEW1Gg1qHTemoXBO1lc2DvCWL6V00Q9Ioojc6adR3WQCRSs12FVbJ7VwTlYzBoI4TRDZGANZL6JSNRgsbCskZQcEcZpK1SLlJKSdCmE49Ca4qaVzUit0TgaPZRkaG/Pv+XmaKEVERAZMXbmTrlO+hn/zXwEonn0eheuu4a4DUtzafg8fHv1B9sztjFXUeKzhomR5BJtuQeby7wPQZblDMhiMREkCrqngrr6aiyD9pj5YUQQ2ZXKrtocaTydSs9T9UkSkRjmORSrl4Dj2RntN41d6At1qxTPOZWJnin+1P8J/zz+HW9rvwvbULXM4CcOYztimM7YV6EREhiCFOhGRGlSolkg/cB/JV75K6rZbqauWMBsjR/UztV7SVSRl1rTMXbPsBspOaSNURkREZGRQqBMRqTHpJCS4/Eo6jv4wlWt/Q+fHP0v5tNPJVssb/sVzOZwZO/QuO+UEbogf6nlYsHOYRC11IiIiA0WhTkSkxqT8EuUrftyrzL/+T7jBhl9wuDM/ilG33kj2nDNJHX0EheuuZdlnDuX3bXcAYGHxv5udTKaa2+B1ERERGSk0UYqISM0x9OlruVH6XnaPvVrhFch+9TRcv0I1k6PJ7uL3dT/kpcqr7Fe3O7kwT+iPnBkwRURENjS11ImI1Bg/nSH79VN7laU/fjyBl95odSgFMZ3Go1KpYhdTTIuncmz6cOpKo7B83U8UEREZSPplFRGpMRUc8id9nvp99yb46224B+6HtcfudDgbL9S9VRQlRJFa50RERDYEhToRkRrU5Waxd9kdZ489qUSxpqEXERGpYQp1IiI1KopiokhhTkREpNZpTJ2IiIiIiMgwplAnIiIiIiIyjCnUiYiIiIiIDGMaUyciIiIiAyY2Ln7oEUbguZAkCZ4dYhOQJINdO5HapJY6EZEa5TgWhbhCoVrC8+zBro6IjAAxHnfNSnHWzyyWrbT4zAUWu37e5vSfePhJDmPMYFdRpCYp1ImI1KBsHJB7bi7lT5xA6fhPknrwfgphabCrJSI1Lkg8xjcaDt0djIGPHQznnQDbTDR899eGEG+wqyhSk9T9UkSkxhgDqbZWWvc9GFYt+N1+/4M03H83znY7aM06EdkgIjyu/BNc+gewLPjjt6C1HX59J0wYA2f8lyFBvQZENgS11ImI1Jh02qXyhxt7At1q5R//jLRRoBORDcOPXH5wfXf3ymP2hnmvwhk/gbnz4a7H4Zj/gWqkS0+RDUGfLBGRGpMkYE2Y0KfcmrAJiaW75CKyYVRDiFfdN3r/LnDjPb23d5Vh3qsJlqVxdSIDTaFORKTGVCpVUocdjL3t1J4ya/w4Ml/6AqUgeptnioisu7Qbs8Pk7j+/0Q6bNvXdZ9zo7tkwRWRgaUydiEgN6syPouGuWwifeZak4uPsuhPFXB1JONg1E5FalXEq/PXCLBf+zvDUS4ZvfRYemAMdxe7tH9o/oaku0rIGIhuAQp2ISA2qViNWpOpw99wbyzJ0+iEo0InIBhTHCWmryNmfynHbw4bbHobbLoblbQlbjocxhRDXVAa7miI1SaFORKSGVavqbik
iG0+SgJuUOHS3NOXAwnVg0vgQO/HVQieyASnUiYiIiMiAieMEz5TxUqsLQHlOZMPSRCkiIiIiIiLDmEKdiIiIiIjIMKZQJyIiIiIiMowp1ImIiIiIiAxjCnUiIiIiIiLD2ICEuosuuogzzzwTgHnz5jFz5kwOPfRQvvnNbxKGWhhJRERERERkQ1nvUPfwww9z88039zz+xje+wTnnnMOdd95JkiRcf/316/sSIiIiIiIishbrFera2tq47LLLOOmkkwBYvHgxlUqFGTNmADBz5kzuuOOO9a+liIiIiIiI9Gu9Qt0555zDaaedRl1dHQDLly+nqampZ3tTUxPLli1bvxqKiIiIiIjIWjnr+sQbbriB8ePHs9dee3HTTTcBEMcxxpiefZIk6fX43WhszK9rlQZcU1NhsKsgb6FzMvTonAwtOh9Dj87J0KNzsv6WrYBKAMZAIQsN6/lPqnMy9OicDC/rHOpuu+02WlpaOOaYY2hvb6dUKmGMoaWlpWefN954g+bm5vd03NbWLuI4WddqDZimpgItLZ2DXQ15E52ToUfnZGjR+Rh6dE6GHp2T9Vc1eU69Am6+z5By4evHJ3z+6JgUpXU6ns7J0KNzMngsy6xTI9c6h7prr72258833XQTjz32GBdccAFHHnkks2bNYpddduGWW25hv/32W9eXEBEREZEhJJXy+NXf4KZ7u3tiVQL47q8Nh+9psc14iyiKB7mGIiPTgK9Td+mll3LBBRdw2GGHUSqV+OQnPznQLyEiIiIig6AU2Pxrdt+hNfc9BZ5nD0KNRATWo6XuzWbOnMnMmTMBmDp1KjfeeONAHFZEREREhpCsF3HATjZ3PNo72O27AwRBNEi1EpEBb6kTERERkdrk+wH/dTAcu2+CMZDy4KxPJGzWHKvrpcggGpCWOhEREREZGVKmiytOzfL9L1lYBlJuiBVXBrtaIiOaQp2IiIiIvGtxDB4lvNVXkWqgExl06n4pIiIiIiIyjCnUiYiIiMg7sh2HSpyhEmVwHM10KTKUKNSJiIiIyNsKyfCL21O8778dDvm6w/1z04R4g10tEVlFoU5ERERE1spxLB6YY3Hm1RavvwEvvAYzz7ZYWXQxfZesE5FBoFAnIiIiImtVjR1+e3fvS8YkgTsfNeqGKTJEKNSJiIiIyFql3ITT/jPhX1fAzefDQbt2l28/KSGONfWlyFCgJQ1EREREpF+WZdFWSnHuNYYHnoYJTfDDk+GAGQnbbRETRclgV1FEUKgTERERkbXw4zSnXg4PPN39eHELfPJ8mPdb8EyZRJlOZEhQ90sRERER6Vc1srjv6d6zoZR9aGlDgU5kCFGoExEREZF+2VbMrlN6pzfXgTH1g1QhEemXQp2IiIiI9CvnVvi/r8KUid2PC1m49n8SPDsc3IqJSC8aUyciIiIi/QrDmOZCmTsuSROEBteBrFeFyB/sqonImyjUiYiIiMhaBUFExi6SWb0kXTSo1RGRfqj7pYiIiIiIyDCmUCciIiIiIjKMKdSJiIiIiIgMYwp1IiIiIiIiw5gmShERGeE8zyaVcgnDiHK5OtjVERERkfdIoU5EZAQb5XeSPP8qlT/9GXfXnRl9wL60ZwpEmt1ORERk2FCoExEZofIeVK//K51f/hoAZcA7+P3U/eLHrEzXDW7lRERE5F3TmDoRkRHKbVtJ8fxLepUFd/8T/Mog1UhERIYy2zbU5VMU8ils+533l41HLXUiIiOWIQn76WcZJxu/KiIiMqQVbAe3GBP88VWMY1F/5Cb4GShWw8GumqCWOhGRESsq1JH76sm9ytz37QmZ9CDVSEREhiLbtnC7YoqnPEH1tiUEf1lM8dQnSPtmsKsmq6ilTkRkhOqILBo+cTz29tPwr/8Tzi47kf6PY2nP1EMUD3b1RERkiMilPYLr5kP4pp4c5YjgnuXkjx5HV5c/eJUTQKFORGREW+kVyBxwAOkD9ye2bFZ0+Qp0IiLSm0nA7tsqZ1yDOuwPDep
+KSIywpXLVTpKoe60Sr8sy1DNZ2mvy1OuyxGnvcGukohsZMWyj3f0BEitiQ6mzsHdt0m/HUOEWupERESkX8ZAsZDj2MWG2T54xvDdMR6fyFnYRc2SKjJSRBEEGUPuyl2o/msZxrVw92+mnEpAmW5IUEudiIiI9MtkUnz7DZi96qItSOD0FkO7rXvCIiNNVxiywg7huAnER4yjlSolXzNfDhUKdSIiItKvLtvmsX5mt3vGB9fVJYTISNTZWVGXyyFI38giIiLSr2wc8/5M72kQDLBDGqJI0yOIiAwVCnUiIiLSL6vs87VGODbffcHQZMO14xKySUysRepFRIYMdYoXERGRfsVxQq5U4YrmNBc0dQe7UcRYnWVNYy7yJrZtkc16hGFEuVwd7OrICKRQJyIiMsxZlsG2LcIwJkkGNm7FQUg66CLvWMRxQhwnCnQib9KAi1lZpfq3RXgTc2S3q6fNCoiiwa6ZjCQKdSIiIkOEMfBuM5ltW6Rcn5TjY/kvQtcTJPUfwGcsxUpmwOsWhlqUXuSt8hmP5Il2Spc811Nmb19P/VensAK12MnGo1AnIsOWbRss3yOJwNgQp6pEkS48ZfjxnJh8uhOqS8Bpwg9zFP30WvfPp8ukwmewKi0kK2/HtPyqZ1tm8jXEuQ9Trqg9TWRD84ox5d+/2qsseqYdKhGs/SMsMuA0UYqIDEu2bQjf8Ljxs6384rBlXP+pNwiWONiWvtZkeHEci4L9AvZT22LP2R179iTS7deQTfU/ZbjjWKSCf2M9+wFITewV6ADMwjPI2Qupz5Y2RvVFRryk2s/NRM0OKxuZrn5EZFiyfJe/ntZK22vdC592vB7x51NasQJ3kGtWe4wBx7GxrL7rlcn6y7rtWC9/DqKuVSUJ1sKzSLvlfvdPOUWsJZesetTfxWQXJngdp/XnpD21XItsSNWswTt6Qq8ya/Ms5NQZTjYuveNEZHiKDStf7Q50tgdRAF3LIpIQfbMNoChdpcNq55HOJ5me3Ybx9jjskjfY1aoplpVA5eW3lMaYqAsY1Wd/14lI7AIGoDIf8ntA16Nrdhj3RXCasNruID3m01SCug1Ye5GRrbNSZfTeY7DGZajeuxxrsyzewePodCMIBrt2MpLo0kdEhicLtp2ZY8vjCrQHFg25hNfvK2HUUDdgbA/+VX6Y0xZ8t6fshOb/5EuNn8Kq6B96oARhBqdxJqblN2sK3bHg5PtMnGIMWFEHZpPTwOTAzsE2v4Xlv4LiEzDqIEhPhuc/DNv8jiRR66rIhrYiqZKeliW13WQiK6G1GCjQyUan7pciMiyZbEjTRxp431ku+5xus8MpDs+PLxC4GscwUCpume8surJX2bXLb6Tq6GplIJUDl3izi0jGfr47zNUfCFOuxyz+AXWZYq99jTHgL4CoBHV7w/zT4LnjoOFw2PIKqD8UVt4BlRdJlv2cKMn1eT3XDkkl7aSSdjzT/7g9EXlvKpWQ9rJPV1HfjzI4FOpEZFjqqHiccLGhc9VcEFEEp11pqFTVgjRgDHRFRVxc3lfYmU3cZmJiQrT40kBKEqhUU5DfHSZd0R3QXvosZukPsaPXuoPcKnGcQG4GhCth/sngvwqlZ2DO3t1h79mDILsdNH8aU55HklR6vZZHidf+8XP+cuJUbv7slsz5zRlkrCL9sSyDMQaPMl61BdPxCmk6sW21/okMJbZtacyzqPuliAxPxhheeK13WTWEcgCZgV+ia0RyQ4/fTTifXaubET45B3uLibSP84ji1GBXrabYtoXnVDBxGRZ8D/z5PduMvxjLnkb0ppn0gihNZuVf+x6o7S7I7QwLvgHT7iAuPUcQ5Vk9mYplGYLW15h1zek9T3nprmsYs83ujNv741RXzeBn2wY3aqf91Xl42Tr8OOQf5x2D3/4GmdHjOeySB4i95gFf5FxE3hvPtsiFFtG8Lswol2Rsik4ifTZHKLXUiciw5JiIo97X+4drfCPk0voxGyipJMWuCzxWTtm
FzmM/StuMvUl9+0c0tmlB3YGSTfmMsp7Anf9ZkvZ/wFY/hab/6t5oZUnyu/RZezGoOsSZGX0PltkagsUQl0nsOsL6Y3qCGnSHx6VP39PnaYse+ysm6u6GaQw4fgu3njyDv3/zYG47bQ/+/YtvsM9p1wJQXrGE2b86C1fdNkUGlWUZsl1Q+tIs/AvnUTnzaaqXPk8BGwDPsamLbQqJjefYg1xb2RgU6kRkWHJMwOWnJnz0oISGAuy7A9z9g5isU3nnJ8u7km5rpfPUb4C/5gK+cs2vscpa/2wgWJYhw3ysuftB252YFbfAsx+E8aeQjD6GePuH6ar0nbkyqMYkTZ/q7ma5Wt3+4G0GxSchtSWR3UR7MdvreVEU07Tt+/ocb9yOHyCxumc0dayIOX/8Hn77Gz3blz/7EGGli8L4yQB0LHqOJFSoE9mYMsaiIXEZVTKMwiWLTfCbBVBZc+MmntuBaQnI2w7pR9oIzppD9ZvPkH6yk5xRsKt16n4pIsNSkiSkTJEffNHjgs872FZM2vF7dVOT9WPimHjx633Kk7Z2zKhm1MNn/XhuglnaeyIakipJ2z/wN/8lpYqz1vdzW6mOuil3Y8crMJYF/iLMyydC/YFEW15NZ7ke6P3cOE7INk9i2syvMe/PPySJIzbd7Qi22P+jlMNV+0ZVii2v9nm9UuvrpOoa6VzyMlvsfzx4dRDqDSCyMWSMhft4O6WrX+5e1DzvkL10Bn5/vSbaq9gGKle91FPk/+B5Mt+fgdVodY/LlZqkljoRGbaSBEwSkLZLuKaiQDfAqoV60v/1kV5l1thmzFgFuoGQJIbEHd+33B1H2ffe9v0cxwltxTyt5Ym0ljajbO9Eddv7KG9+HW2V8YRrea5Pjm2OO5PjrpnPzGsXsNvJv6DyphkyEyfLNoef1Os5lusxdvr+lFcsZfv//B8mfeAzVBXoRDaadGzhrw50AF0hwc2LcI94y/dHxsbeKk9059I+xwgfaMFRN8yappY6ERHpVzG2aDjty5h8Hv/GP2NvPZn8hd+mWjcGAo2rW19+kJAb99/Qcg1UW7oL05NJ6g8l7Hr3M4wmSULRTwGrJ7B5+8BVTVJgvWmymzftHoYxo6fuzT5f/zXzbrkcL9/ALp+5iFTDJhxyyYMkXoFyqPvBIhtTUonXBLpVwn8txzt+IqmvTSG8fQmmwSP14c2IXilibZkHlvXa39oyTxz3Hp8rtUWhTkRE1ip0R+G9/1OkjvpPkpJF5aoVuMe14+1QIAi1tMH66iiPpm76bOh6FGNSxLmd6CjX8U7BbEPykyxjdpnJvtMPwlg2kZWjEsXdfXvCQauWyMiVsTB1LklH98009+BxuIeNgxjinepJb1dP9e5llL4zl2RlQPaCHbEm54lf7ure//iJONvXk++KIO1QtGKqCng1R6FORIYtyzIEsUc1ssh4IZ4dE0VJn9kCZd05QULp4pd6lQW/eZXU97Yn0LJI6y2MElZ01WHbhwIQdcUMZqBbrVqNweS7q6LPk8igsG2LbGgwQPbSGQTXvYo1KQcRlM6eA36Es18zfGQiwR9fXb16CeWL5pH5xlSod8E1JAtKFL/4b/BjTMEhd+72dDXZhPps1xT1oRCRIcuyHcpRhiBOY9u9v65s22JFKcvnL3G58V6H5xeluOovGR5/MUOVDEaBY0Ak/Q2qr8aYwc8dNSWKYt2MEJEetm1RKEL4nXmUTnic8nfn4s7cFHvX0fi/nA/lCGII71lO+Fgrzi6je56brAgI7l1GOW9ILEP5B8+D3/39knSG+N9/jqy6UdcctdSJyJAUmiw//4vh2tstxjfCD0+22aLZh7i7/1c5yrCi0+K8z0EYwjFnGYyBXNowYyvD5SencU3/yxu4rk0YWzhWTLWqLoRvJ05bWJNyxK8Ue8rc4ybge6grnojIBpINDf4FzxK/1r2ETLywRPm7z5I5a1qffcMn20h9bCLW5DzhrBWkPjIRqylN3B5juVZ3AHyTeEl
FrTo1SKFORIYc23H4zZ0W37q2u7nt1aVwwCkW836bImOFVJM0P7rJcNn1EEZw7L7wwP/BswugvQiTNjEk9D/LV0iGOx61uPk+i32mJxx/UIxHmUTTOfary4qo+9/tiO5dTvxyEefAJuItc/ihEp2IyIZix4bgtd5rgibLKphU3zhmTy0Q/Gs5rPDJnjmN0nfmEi/ovhGXPX8HzJgUyRtr1pa0p9URqrtFzVGoE5Ehp+g7/O7vvftPVgKY8zK8b1uLBUttLrluzfYTjoBPng+PzO1+PLoOHvmJIZUyvcJaYrl8/w82l13f/dxbHjD87RGLX5+VwltLq95IF8cJbVTxPjAG6/1N+MSECnQiIhtUYoFpTpEsXxPGTIMHnkX2sp1IVgRU72shaQ9wZjRQ+ubT2NNHUb1veU+gA/B/NZ/s2dMoX/ki8Std2NuPInXK1rRbUc8YPKkNan0VeZeMQeO0NhLPjpm8Sd+7iJs2gzGGWc+vORGTJ0BHaU2gA1jRARf+FrB637cKIpcf/7n3SbxntsEPtXbPOwmqEZUwJAx1FSAisqEVnZj0GdtiRrkAmDqXzJnbEs5tp3T2HCqXv4A9tUDq41tQPv9ZCBOsBo94Se8blNELnVSfWEnmrGnkfrgzqU9sTuLqYqYWqaVO5B0YYyhgY3WEJNUEGl26LE1qsCE5VpXvfcHlvqcMLW3dZZ84JKGpLiIMI/aevmbf0QVY0tr3GK8tN4SR6dUJM5+q8NDlHSxvs7jgj83c+2T3ODxbt7dERGQICaMYv9kl+/2doBqDZxG1VfG//zzQPTGtf/XLZM7ZDsZ4WI0e4cudpD++BdW7ei8+7kwfRemMp0haulv9rM2yFM7bjnYNjK4pCnUi76AOh+CiecTPdQJgxqapu2AHVqrfwgYTxwmjs2X+/bM0r79hGJWHfDrCNRWSBEYVEi46yfC938IzC+Cnu0DKAz9Yc4yTjolJOSGrewqOyhdxXr+QHTuuhdRYdv7aVXz7xr0oV7N4tn7YRDYUx7FIVw0kEHhofUORd8GzLFKLfEo/eJ6kxcfevp70iVthGjySlat+7CzAMmRO2oropS7sSXnI2qS/NoXgjwvBGLyPTiReWOwJdED35CvLA0yzrfHkNUShTmQtUm5CmgrMK/cEOugeqFy9Ywmpo8bia+bEDSaKYlKmxKRmeOtvjkVE2bf45+XdM18+8wr89UL4/h9gZSd88diEPadFPV0FU57BfuOXmKWXrzp4B42vHcYln3+Z1soEHHxk5HIci3xoYWGISSi6cfc6bbLe0sYitSig+qsFJJWI9LET8GbU05Xou1Pk7WRjm/K3Z0Ol+7soeqYd/9fz8Y7YBP+3CwDwjtmU6LkOgj8s7Hle+tRtSHZrIL1NATpDaHDxL36u7wtEMcbYfX5fZfhSqBPpxyiviP3vH2N1Lsb3T++zPVlUxtIX4UbR3w9OxgmYub/Nly6z+NpHuidJ2WoC/Nch0DwKDt8jgXDNuALP6sRqveEtR4lwK7NI22OIdH05YrmuRaFsUbniBaJn27G3LlD4yhS6Cg5BoBbc9WEMZPxViySvysj+5S+SPmc7nK3SGp8p8naKYU+gWy18ph3vuE17Hjv7NVH6xpO99vGveYXs1jtS/PIsAOxt63CP25Townk9+5ixacy4NHGs77haopEkIm+RcSLshy7Fuuc8mPsnnBkOvGVMsXPIOALd3ho0URQzrq7Cb78Zsf2WMf/+WcL+MxJIYj64Z4ydlHvtHyZpktyOfY6TpLcijnVhOdK4lsUoHEZVLApVm/JF84ieaYcYouc7KX93LnlfEwmsL8exCR9f2WeGvfCupbixwbL0byyyNiZng9P7M2JPzsP4NLmf7kbuZ7thcg6Eva9FkmJIUlwT1qJ5HcSvFsleOgPn4LF4n9qCzIU70Gnrt6/WKNSJvEUq6cJ66jfdDyptmOd/QvasCVhb57EmZkmdujXR5hlNlDLI4jgiZZXIu0U2G13k4i9
UOOU4n7RV7HNuKoFDvOm5kJ605vnjTqZqxqrryQjjOBa5FRHlU2ZT/u9ZJMt94pe7eu0TLy73uVCS9y6OE8z4dN8NB+WJ/aWUFzxMKlqBY4K++4iMcGUTk/rqFMh0T/dlxqVJf3lrosdXUPrfOVQueQ7iBHtqXa/nOXs3ET3T1qssuG4hSYNL9KmJ+AePYWVS1TVMDVL3S5G3iLGgflMoLgfAevwSrE3vI3Pm7VRJUXEh0DpdQ0ocJ2/bVS5JoL08hsK292MlHWBlCMIMXeUMrgs5twPLRISxS5dfII51QV+rMlUL/wfPQlf3+yXpCjGNHknrmmBh6pw+d8jlvYuiGDM5j7VtHfG8DgDs/2xiwdJbeOxbpwJgLJsDzr6JUdM+QBDoIlNktUoSk0zLkbly5+6bTK5F+FwH/lUvARAtq1A6dw65C3fEv2Ux8bwO7J0acD8wlqSzivnzYpLO7u85a/t6IgMVdSmvaQp1MuLlcikc21Dxq/h+RMnU4x5xFda1B/SMy4on7EHgVukKHTQD8PAURTFtxQJQ6Cnz3JBCMhvr2Y9DsAQ7vxvONjfQVmpUsKtRtjEEi9Z0zw3+spj0SVtRvux5KEWQtkh/ZQoVN9ZnfQB0EFI4fQpWe0gSxFQbO5n1uTXjlJM44pEfnchhF99DynbAyeGTG8QaiwwdfhzjmxhcyCWQ3Pp6r+3JMp/wtSLuwWOJRnlEr3RR/OK/sbetI/Od6fi/WoC9XR32QWNpT8LudRCkZinUyYhl2xYNThs89SdYMht3h4+Sb9qO1qCOzvwU8qe8CG88B/WbUnUa6Aozg11lGWD5VAnryaMgLnUXdD2O9cpJZCf+iq6KznctComxdxpFNLu7e1L0dBvVRo/8lbuQlCPI2Ph2TEnT7g+IJEnoIMTUA9i4QUBU7T3bbLltGeWVy7jjjAPY/cQfsuk+H8OPU4NSX5GhKnbAGp+BZzt6lVtNafzLXyR6bk15NKedxDJw6mQCNJJAMwAAIABJREFUC/ywurGrK4NAY+pkxGqwVsB1H8Lcdipm9i8xvzoUM/cG6jIJQWSzIqynbfSerDAT6Iyyg11dWSWbChida2N0rpW6TNf6TbYQta0JdKuYjvtwbI3xqVUlKyb15a2xd2kAx2BtnceduSntdsSKTMwKqhQ1HeqAS5LugGfcNA1bTO+1bbPdj6Rl3kOQJMy65gysqDhItRQZuipxjHf8RMxor6fMfl8jeBZJqW+3gqQcUU5i/LfcoPIsi1GJw6jAoh4H11YUqBVqqZORq1rELHq0V5F58FK8bY8FGgHUBW+IKWRKeCuuxlr0XUiq2IX34Wx9Ayu6Cu/85P7Y9WClIV6z/AGF3Qkjd2AqLENOHCe02xGZL08mnRgiEjrtRJMGbCRVq473f+tWZv/6m7zx/KOMnb4/Wx/8Gf5x3jEAREGFOKyCPoIivSRJQqcXkb90BsmKAJO1idIWRTchfeQmPWPtAMxoD9PkkSS9w55nW2ReCyhfMA+KIWaUS+5/t6NrjE2o78BhT/FcRi7TTwuPsemzfoEMCcaAm7RgvXYuJKu6knQ+hFn6Q1Leuv0Ylfws8dbXg9PQXZDZlmjSzykFGtNTy+I4oZhEtBPSRaRAtxHFcYJvj2aHz17OweffRcOWO3DX2Yfit78BQMMW07Ecdb0U6U8YJbRRpaPR0JaO6SQkqEYkuzWQOmNb7J0bcI/chMzFO9Ll9P1ey4QW/sXdgQ4gaatSueQ5sqHiQC1QS52MWImThS32wyy4b03Zfv9D4DVqOvMhyLYtKD3Vp9zqfAB3THmdJleoVF1idz9y05/GEBAlaboq9Vq7TmQDSpKEauISOU1s/r7jWPnyUyx5+p80br0bu55wEaE3Gqr6DIqszVuX4ulMQpztsnhTJxPaUIxCkn4mejJhQtLRe0OytIKVoPvZNUChTmqabRsKpgOr0gpOitCpozPKkSTQFjfQ8B+/I3n5bljyJGz/YZL6LekoKdANRVE
UQ37XPuXxqMOpxuveshZULYJq/ZuPuM7HEpF3L45jKlY9O376Iqb7nRgvS5DkiBXoRN6zMIy7J+x9m1l7E9dgmlIkLWsmK7Im5YjUUFcTdBqlptVb7bi/OgD7qh2wr5iC99fPUu90LzQcRQlvBPWUpnyM4KDv0zFqZ1qrde9wRBksSQJ+NJp48jVgjwIsksaPEDd9Hl/zmogMS3EM5dDDtxupRBm1kotsQEUrJn3OdlgTuyd/s7bOkz5jW0q2Pne1QC11UrNSLpjHroTWF3vKzIu3Y7XMwW58H1HU3SJXKikRDBdFP02Y+RDZHQ4BkxCEaUpFjb8RERF5J2Ec01lnkT13GjbdE0V1OHHP9ZAMb2qpk5pk24YsHVjTZsIJ98FuJ/ZsMy3zsCy99YcrPzCsLNaxsqueYiXVZ2yBiIiI9C+KYjpNRJsJ6TSRAl0N0ZWt1BxjDPW04vzmULh6d/jVIdAwCfY9A4wh2eowQi0sLCIiIiI1Qt0vpeZkbB/rrtNh+dzugrACd58JJz5KvPkBlJ1GEmU6EREZANmsh+va+H6VSuVtZqkQEdmA1FInNcc1VczUY+Hg74GX7y5MEhK/i86x+1GONAZLRETWj21Do+XhPriC6CevkJ5XYrRZs2q641h4VPCsKpal+eJFZMNSqJOaUucUcYuLSRbcC1EIX54D0z4ElkPcsBVBVU10IiKy/uojl8r/vUjlRy9SvXsp5e/MpXrHUuo9D8+U6Xrpfh657GPM/ukXsYqLcKy+Mwy6doQXt+NRVvATkfWi7pdSMzzPwWt9FvPLg9aszDnrZ3DCfcQ7fYZSsu5rmYmIiPQSJYSPr+hVFPxlMflDx9H52lP845zDe8oXPXorx1w9j9BqAMAYSNPFM9dfyGsP3UzdZlPZ/aQf4eXGkPgd+F0rSY8aS9WuI9K9SBF5F9RSJzUjm7Rh7j2fXtMhtr9GsmwOpQkHUonctT9ZRETkPXi7drV5f76s1+PQL7HkqX/iON2XXa6p8vTvv8Vzt1xBseU1ljxxN3eevh9B22vc9NlJ/O2UXfjrf0+HjoXYtlrwROSdKdRJDTFg+nlLGwtNdikiIgMptg3OHqN7lXnHbUqUgnRdc5/90/XNPfccTVhiwf3X99peaVtO0LECY3d3ogq6VvLYVV/Cjoob5i8gIjVFoU5qRsnUkRxwTnew83JQPxFGT4ZxO1LVWDoRERlAHXaV9Be3Jv3VKbgfHE/m29vjHjyW9iBkh4+ejZut69m3YYvpNEzeiSjqHleXYJEfu2XvAxqDk86RRGtm0OxY8hLE1Y3y9xGR4U1j6qRmBEFEMGoK3ldehFIrdC6G8TtRojDYVRMRkRoTRdBKQHb3epz3NVL2q/h+dwCL080c/eO5LJ/3IKlcA/VbTCcwhZ7hAbFXz54nX81dZxxA6JcA2PGj/8vrs//e6zU23+fD4OZhAO5Lug6YoAOMRezWEYZ9J24RkeFLoU5qShxD8siVWA+vGs/gpMh8+l8EdTvoB0xERAZcqVQFeremhbEhNPWM3uFIkiShEiW9xnuHYYw7ZhuO+enzFFsWkmkYh/FyBO1LWfT43+ha+gqb7/sRtvuPMyhH9nrXMW1KLLz3jzz31x+RKoxm1899n9S4aYSJxpqL1AqFOqkpHqU1gQ4g9LFu/SLZj95Gh1rsRERkI3q7m4lhbBGaepyxOxAkCSRgN2zFPmfeBEkIbo5ytP6Xaa5rs2zW3Tx+9akAdAJ3nnkgx/3sBUJnzHofX0SGBo2pk9rid/Yt61iEhVrpRERk6Ene1IIXRTGByRFY9QQDEOgAqHYx/77rmHTgx2ietnf3a0YhS+fc0zMbp4gMf2qpk5qSZBqhMB46l6wp2+Hj+EZr1ImIyLpLmS5MWOleQ66+icgdRTAM5jBxHJs9T7qCJU/9i1zTZuTHbsHfzz2SwvitiOPknQ8gIsOCQp3UlM64jrrP3o919+mYlnnE232YeNcvUg70VhcRkXWToouX77i
ap35/HiQJXr6Bwy76F+7orYf07Mqua1FZPp/bv74PUdUHYOz2+3LQebdhvDy+Qp1IzdCVrtSUMEpos8aSOexq7NjHtwv4Qf8Lt1qWIWd14RKQYFEmRyXUR0JERHozUbkn0EH3GnKPXPUl9j3jD1SpH+TarZ1dbefRa8/sCXQAy565H7+zFad5PEQKdSK1QlewUnPiOKEYp4AUaxtKZwyMcjqw//RfMP8e8PLkD/8BzuRj6QrTG7O6IiIjnuta5Ks2JkzANgROQteb1msbbH5Ha6/ZKwE6Fr0AcTSkZydIoipB18o+5UHXStyxBlCoE6kV6/VVdOWVV3LEEUdwxBFHcPHFFwPw0EMPcdRRR3HIIYdw2WWXvcMRRAZH2omw7v9ed6ADCLowt3yBdNQ2qPUSERlpbNui4FtUzptL8QuPU/zyLOxZ7RTM+k/lP1DSDePw8qN6lW22x9HgDu3x2ibbxDYfPKlXWaqukVGbb69lfkRqzDqHuoceeogHHniAm2++mT//+c/MnTuXW2+9lbPOOourrrqK2267jWeeeYZ77713IOsrMiC8pIRZ0M97c+mTeN7QuZAQEal1ucTC//krxAuK3QXliMqVL+BG/XedHwyxU8+hF/6Tpql7kqprZKtDPsOMT5xHkKQGu2pvq+JHTNjtCPb9xm8Yt+OBTPrAJzniskeI3dGDXTURGWDr3P2yqamJM888E8/zAJg8eTILFixg8803Z7PNNgPgqKOO4o477mD//fcfmNqKvAuOY+OagAiXoNr/ncjYycDm+8KyZ3pvGDP1rT1sRAaV4xoCO8CNXaJgsGsjMvDsEIIX3rIcTQxJi4813iYeAg1Kfmjwxkxhv/+5AZII4+aoJKkhUbd3Uk4KNO36HzRPfz/GTeNHHvFafhvXlePY2LahWo00o6bIIFnnlrqtt96aGTNmALBgwQJuv/12jDE0NTX17NPc3MyyZcvWv5Yi71K9W6R+4S3kbz+Bwtyf0eAVMabv3d5S1SPZ53SY9P7uglQBDv8BSaowpGcyk5ElzgfcVL6Nkxedw886fk9U8LGsodN6ITIQItdgb/+WyUYcg9WcHlKhKQgiKqaeijWacjQ8At1qvl+lQoFy1R3Q0GWMIW26eGPWjbxw03exy6+TilfiGb/f396RwhhIpRw8z2EE/zPIRrbeE6W8+OKLnHjiiZx++unYts2CBQt6tiVJ8p4/1I2N+fWt0oBpaioMdhXkLd72nAQl+Of34cFLAbBeexgrLDFmjy9Bup/ZySoRfPAKCMuQRJBpxMqPpcnNbKDa1yZ9TjaMYlTivIU/57ctfwbgwc5ZPNDxOL/c5hKa3Ia1Pk/nY+jROXln8ae3pLwyIJrTjhnlkjphMnFHwJjNcpjUwHeJ1zkZGKXWJXQseYnIL7HF3jN59qZLWfLkPxg9eWd2+/yl5Js3f9fHqpVzklRjktaA4NbXwTIUDhuPGe1hhuFC77VyTkaK9Qp1s2bN4pRTTuGss87iiCOO4LHHHqOlpaVne0tLC83Nze/pmK2tXUOi6b6pqUBLS+c77ygbzTudkwanA+fRK7sf7Hs6TDoI5t1M8sQviad9mLao7i3vLYPjbUHK7QJjU0nSRG0hoPP+bulzsuFU82X++MatvcqeKM6lMywRt/X/1a3zMfTonLw7OcfBm7kp5mNbkJRDqrcvIZy9kuzVu7LSDOwsmDon68+yDG64gkevPJGlc+6lcaudGbP1Lrzxwr/JjB5PUGzj7+cexfu/fQe+eedgUCvnxLKgPrApnfwE+N1NucGtr5O9YmdW2kNnNtd3o1bOyXBkWWadGrnWOdQtWbKEL33pS1x22WXstddeAOy4447Mnz+fV199lU033ZRbb72VD33oQ+v6EiLvnZOGcTvA+J3hV4cAYAD7sf+j7tP30hb3/pCEYUxIdtWjwb+ZIO/Mti1c15AkEZblUi5XB7tKA8pxbCwrJgTydo6VYXvPNgsLB03kI7XHChIqV71E0uL
3Ko+X+5hxDokGOw8pTtTFA5d8jOVzHwRg+dwHuffC4zn4u3ey8OFbCCtdTNjlMEiq3T/CI4AxkHOrBDct6Ql0AJQjwn8tw/vgWIJgeAU7GV7WOdT94he/wPd9Lrzwwp6y448/ngsvvJCTTz4Z3/fZf//9OeywwwakoiLvpGQK5A/8NlZShYcv772x9UWstpexCjOGREuwrBvXtbCsiCeemE1nZycTJ05kq622olisjXGQqZTFihUtvPTSi4zfdBPO2exkTpv/3Z7tn23+MG7oDWINRTaMyAZrcp7ozaHOgDUmRZLoQnioMVR7At1qnUteIQ4DKm3LeP62q5lzw0Uc9aMnMBkzIkK5awLm3/sHJvL+vhs1sE42gnUOdWeffTZnn312v9v+8pe/rHOFRNaVXzV4234EL2zDmvunwa6ObACOA3feeTflchmAtrY2qtUqU6ZMo1we3hd+jmPR0rKUhx9+GIClS5cyZadtuWf763ik80mmZ6cwzmrGKrmDXFORgVcxMfUnTKKyqES8qAyeReqESVScRJ0ohiSLXPNEissX9pS42QKV9hZKK5Zw0Hl/4+/nHMEzN1zC9M/8gCAcfuPJ3rOwzJwbLmbiWUfAndaa1rqMjXNgM51qpZMNbL0nShEZSjrDLJ5XoHDgt7B+feiaDaO3Ih41ibiqq4PhzPcrPYFutfnz57PNNlMGqUYDw7IMjgMvvPBCr/LnZ8/jwFHv5+j8IYRBrFZmqVlxnNDhxeS+vT1WmIBjqNgxlbg2WuFrjck0sPdXfsE/v3McYbkL202x+4k/5Pm//Zj59/4RYwzbHP4Fii0LGTHrBBmHfPNmPPy7k9nr/B/BQ2USA5nDJtLpxDCMZkuV4UmhTmpOEESUG3cic9ITmNnXkIzemmTaf9AZ16FbvsPb6nUx3yybzfaz5/DhOIYkqbJyZWe/fz/Pc6lW4xHRfUlGtiiK6TAxrG6M1kXwkBVFEBQ7OPi823CzdYSVIi/c/lPm3/tHABY99jf2/sovyIw5njBxqNXfXsexyHW1Y0VVsGz2/epv+NtX9+DmM7dnwi6HMvmgT1PnNFGNh/Yi9VIbFOqkJpWiNJX01rj7XUwcJ6vWnqvNH5WRxWLy5Mm8/PLL3Y8si1133ZUksYHh17XFcSxsO6JSqZLL5Zg+fTrLly8nirpbJ5qamkins1QquroVkaGjWo1o3GZ3bj1lJw761q3c9vV9erXI1W06hfrNtyNJjaZaoz0MLAsKnSvp+MTnqN73AKaxkbqfXM6xP5xNOejESeeIrAzVRIFONg6FOqlZcZzg+8PvQl/WrlKJmT59OlOnTqWrq4uGhgbi2AzL82wMeB7cd9+DtLa2YlkWe+21F4cffjhvvPEGuVyefL6A79fmBZGIDG9VZxRHXjGbjsXPs9Mnv8uTv/lfkjjGyzew15d/QpKbsOqGam3KVysUzzqX6n0PAJC0ttJ+/KdofGkOYW5C921GfX3LRqRQJyLDSrmcYFkO+fxoyuWY4fqr6boWzzzzDK2trQDEccyDDz7IwQcfTDqdIZuto1Kp3QsiERneoighMvXkttyLus1nMOnAjxMU2/AKjVTtupoOdABWuURwX+8ZQIkiovkLsHYYrTHQstGNgOmIRKTWxHH3GoOrdbd6GdLp7v9cd+h/tdm2oa6ujnHjxvUq7+zspKGhgSCo7QsikeHCGIPr2qRccOMOXBNgWZqifrVqNaISeVSsBuLCllSoIxoBX19RJoO75269Cy0Le4vNFehkUKilTkSGvXTa5okn/s3ChQuxLIvtttuOLbfcCt9/57FolmU26g+wZRlSKcNLL73IihWtTJgwgSlTpnD//fcTxzHNzWPV5VJkiPAoEbYv5fVn7qVhi+0pty1n4cO3sPOnvkeYaiaKNN51pCq6OeovPp/o5fmETzyJKRTIX34xYTo92FWTEUqhTkSGNde1WbRoIQsXdq+XFMcxc+bMYfz4TbDtzFoDWz5dwrPaoDIf8tM
o+Xkq1Q2/Bpzrwn333cuKFSsAeP3115kyZQpTpkzpGSOoC0WRwefaEYvu/yPP/ulS0g1jaZs/h20O/wKFcVty55kHcNilDxNRGOxqyiCJ45jO+kbq/vwHTFAFx8HP5unUpbUMkqHfR0lkEBmjLjZDnTEJS5Ys6VPe0rIc2+7/Ky6X8km1/h/2k1Oxnzsce/ZkcslsXGfDfyUmSdwT6FZ7+eWXmTp1KqNHj6WqtRSHtXTaBTvd/X8Z1uyoSL55Irt94TIm7f9RDv/+g2SbJrLZHkdTXL6QsLgCxxoB/QxlrcIwpi1dz8q6MazMjqKkQCeDSO8+GdHydhnPVCCJqZoMXVGWJAHXTihYHbByAeSbqdr1dEbDez20WpUkhvHjx7N48eJe5U1Na+8alXJLWIvPf9NBQqxXTiQ79V7aw7oNWV0sq29wdF2XKEoU6IYxywI/yXPdnXDPbMOBOzv8x/6aynw4MyQ8f+uPef2Ju7of2w4Hf+d2MqOa2fqQzxIFZVJxkZAN+50hIvJuKNTJiFXvFnH/fgbm6d9AkmBNORLnqJ/RERWoq7yEdc2+4HcCYHb/MoV9zqYzVLAbaqrViE03ncjy5ct7xtRNmzaNdDpLkhiMgWo17t0NM/bps7Kxv4iNMfdBHMOkSZN45ZVXesp22mkn4tgwXGfyXBvLMqTdAENMJczUdLdSP85y2pVw4z3db6JbHjA8MjfhspNB7f3DU1Bs6wl0AEkUMvvXZ7PlAR9j0vs/zhsv/Jvm7feHnEKdiAw+hToZkWzbwlk6C/PUr3vKzPO3Yk25lcI2R2PddnJPoAOwHrsSd69TAYW6oahSidhxx53ZaaedV5UYkiTmxRefp1qtMnXqVGzb7WkJi00OOz0J/p+9+w6zqyoXP/5da5dT50zLJDMpkEICAVLoHUHpAnIFUVHA7pVrw94vCGLhKnqRi92fBQsqohQREAQRFEILJAQIENInZdrpu6z1++OECcMkJIFJZs7M+3keHjLrlFln9tl7r3eVd1U2B1a27e1Uo9ROr2sQWObMmcf06TPo7u6mvb0dpVzCcHQFPL4X0+CuRq28CJWYRmr8ecSqkULQMipTnQeR5g93DQzfrr1Tcdn7IO0MU6XEK6YUhOXioPJqvgsv1cBj136V9nmvxUvnGH3fZiFEPZI1dWJMchwNy/85qFwvuwM36IKeZYNfVNqILLEbmaytBUuVSu0/sPzlLzezZMkSnnnmGW6++Was3ZyGPF9pJJ79N2zrWyE1GzPxM8STL6Uc7Jp1UJWKwfMydHTsRhw7o27apedacslu9GMHoDLzIT0T9dQ5uE+dTi6+k0yiOtxVHHJag/uS4M1zkWtGnbIW0uOmkGrpGFA+47jzWfnATcRhlYnzjyfS0tEnhBgZJKgTY1IUxTDrlMEPTDsW9fjvYO83DixPNmFzk7Gjq+09Knmew+rVqwjDsL/MWsvixYvIpCIA4tjQXRpHaeKVVGfeRrHlM3QXMrv0+BpjCcN41H2nHEeTS6xDbfgt+BMhsy8sfS+UFkLxYfSSU0moVaNuny/fjfnwWQMP5sfOtjRIm79uhU6Ok795H7NOfh/t847lkA/8Lw0TprH83j8y982fw58wi8hKQhwhxMgg0y/FmGSMJWiciX/819B3XQomggPfC6lmuOlD8LYbQGl44nps8wzsKd+hYGTdRD2wFpwtBAyOVrg6BJxNz7OUqj7g79oKjnJpvw+98mJoOh5yR8PG6wc9R2/4JW7LlwmCaBhquHM4psxHzsxw6uHw94fhtfvDtHZLc4NifWW4aydeiTi2GLeFOed9AxX0suqh21i54C+c9D//JDFuBtVguGsohBCbSVAnxqx8lCY55/2k5rwdR0Woe6+A350DcQjXnAZz34Z95x1UbJqSTWPiUTakMkpFUczEiR2kUinK5TIAjuOw96x2lCkAsjHszuSoGEwI2YOh717wOwY/KbUXdrQNUQK+KjJnN83+MxzCMCaKDEHYQNl
kAXC0IeVURnXCmNHGWqiEGlQz7Ye+hfZDziYyinAr+18KIcRwkaBuO6WdJHG11vuvPSjb8na9znMdbFWjXEtEPCobMvWsEjlUyNHq9aDWPFwL6ACiKraaJ9JJCkGK0ZaVcDRzXY1SlpOPP4yVK1cSRoapU8aR7P4hUfKjw129US+I07gdF6CefhdMvQz8ybDh11B+svaE9FxoOo4wPzrTS0SRIYpqQZtRCW5/EN5/uWL1BnjdAQ4//kyaXKJMOXDwXIOJRtZopeNotFajMpnNqxX0JzOS+4EQYuSRoG47JE2SR39f4oGf5DGRZe6bshz8ngYq6uUDO98meOaWCk/cUKZpisPhH8xhMwGxjPiMOHnbRO7MX6Du+w6suh87/Tg48L30hVnkBl4fHEfh+4oNG9azPgiYNLGD6ZMc3N6bMYUMtuOjFEoN7IzjWQskpSEMUA480tlpUF4Mjx8Lja+FGd+DuAjeOAjXMVYGOaqxx5lfgGBTX9HfHoTPfA++cF6KC69UzJ9p+dAbLUmnNHDLjWGgtcKNfNY9FlLojJl6ZBJSIZGV77QQQtQDCeq2IZfz6VwYc+9Vff1lD/+qQPscnz1PTtHdveXAznMdllxX5h9X1F63ZiE8/68qb/vteGJHFliMNEEEff44Go78FGrjU9CzHFvpI+EmKMuaq7rg+4rbb7+NQqEAgOu6nHzyKbjNHyEyPkHBMNQBndaK5kwPKu6GuAANkygETVSqoysJyI6wFopBA5mOD6JWfgV676j9lzsaJn4U0/sPAu/g4a7mTqc1rFi7OaB7wd8fVnz6HLhtAdy2QHHTfYq/XJ4ksY1Owp3NjXz+9MEu1j1Rq7DjwVuvGY/fbqQjUggh6oAEddvgui7P3d03qPzZu8vMOHbra3Ns2WHh73sGlJU2GgqdMYlJjLqMd6NBxvagf3Q0dD0DgFKa9Hv+SZCdI2tgRjjX1WzYsL4/oAOIoohFixaxzz7zCIKdM9rQnOlBP/ch6NqUDMSfQnbO3wmj8WO6IVyuOvhtH8JNzkJv/A02cxBMeEct42ficArl0Z8S0hiYNK4W3JkXXT4O2BPWdm/+efEyKFU0iZ2/ReJWaa3oWx73B3RQm4l+z3f6OO6SBmIVvsyrhRBCjASypcE2xHHM5AMGB2+T90/gvdxNWEO6ZfCOs4msxgl9PFf+9COJ1gq1fnF/QAeANeg7vkRSlYavYmI7KYKXDokAYRjwcqNzSoHvO/i+xnF2bHTNdTWq+szmgA4gWAErLiOTHH37sO2o3mKavsRZFCf/gnzTJ+kqttFVbCM/BgK6FyTcmKs/DulNt5C9dodvfhA+/4OBz/NGQPdqUBp8ngQlg5X+LCGEqAsSWWxDb2+V8bM99jkjDZvafNOPSbLH61J0dW29se8kLcd+ugnnRVvYzDwuxfolIde9p4vOB8CV/W1GDKWobWvwUjaWYdU6EEUxHR0duO7A1vHs2bO3OmLmOIpEQhFVVpFQq2lMdtGQ3v5gTGsN5aWDylXlaTQS1AGEYUyp6lMN7LCvGduVHMehYlJERnHWa2DRzy1LrrHccrllY4/h4ac3P/ec4ywJd3jXrRljGTfTI9U8sElw0LlZsgmF40hTQQghRjplR1g6xo0bCyPi5t/W1sD69fn+n5MmiYkU1lq0q7CpMsFW9qhJ2ASP/LpEaWPM/Ddn6V4e0TjRpWdFxG0XdxOWa5/v3N+PxxkXSkbM7fTSY7I1CQ9SpheLoqwaCaLt+/s2e3ncHx4M+dX9gZx55530Nh/Un81ODLS9x2RXcF2N1jGLFy8iCAJmz55NIpEmCLZ8/FMpjamuIbvmg+jevwIOpv0Coo7P01vcvtGkcdl1qIf2BDY3yu30qylkzqdS3fVZDUfS8RirIuvzz8Uen75aky/Dh8+Cd58c4VJbMxfj01v2uONBxb7TLTOif9brAAAgAElEQVQmGjyGdz0dgKM1uuLz4M/y9K2J2e/UJK2FPNy8gtTX5tHN6Jm
CKefJyCPHZOSRYzJ8tFa0tmZ3+HUS1G3FK/0ye67Ds7dE3PGVXgC0Cy3TXE75eiu/OmcdUWXzZzv0PxuYc66/09b7jDbbc0xybglv0TXof14O6TbM8V+lMv4givG2F6z4nkOOjVBaD46HdZKUdTOlODFUH2HUGWkXfaUUnqcBSxzblxml08RRkcbyz/BXfmrAY2afu+mO99uu61AmGZKKH0Ut+xiEG7Ht78e0vYuu/I5fjIfCSDseY41S0JnPMPcdA0e2fvkFw8kHVfqzoypVG80zxoyI+90LXFeTXm8I/7EBHunCLCsC4J85mfCMdiqjJLurnCcjjxyTkUeOyfB5pUHdCJjJP3o4jkIHLn7aMHG+z+pHAkwEG56OWLMwoHW6S+fizb2dbbO8EXVDr3euq/HW3o++9ZNwwtdg4oHo9U+QGrcH1tWUoq0HZ1orstEa1E+Pgb6VoBT26C+g9//grvsA4lWz1m5XJ4m1loRv8TfeM/jB/H04DQdgzLbfp1jxCLyDycy6CUVMJW6mnJdzeqzyPJdb7x+8NvPXf9O8dj8HvWlE19ralOGRRimFXdyDuX7FgHKzqkzKOFQYeXUWQghRI0HdEHGUQ9DpcOd3eglKhnlvyjL7FEMcwoS9PbLjHdY/FVBYH3P0x5ponVZbv2CCCLRM7RsKrorRC38Bh18I5S746WuB2lLI1Jt+TTTl9QRbmRGXUmX0LR+vBXQA1qLvugR/3rnAMKalG2FcV+O6EEUhrusShvU5JcsYi5vMETScgt/154EPNh5LHG7/ORmGhp6wYdNPEtCNZZHRTJs4uHzeHpaEZwm3MmV/pIiiGPegVoKfPgcvOgXcw8dh15Zx2j3JBCyEECOUBHVDRFU8fn3OWuJNbdxVD3Vx1g/G8eStJf5+eQ9NU1xO/moLh7wnx58/upE1CwNQMOeNaQ69IEugR/jdvg4YHEzHgejdj4CfHjvgMX3Th8h94HC6ndYtNkocW0Wte3zwm/atRDd1yIgqbMoOGXLLLbdRrVZRSrH//vvT0TGFMKy/v0+5EtM07gxM5SH0up+CTmAn/zeB3k2Ot3hFohgaM4ozjoLr/1ErmzMd3nEy2O0Y+R1u1kKUVKQ+vw/Bb5djKzHeCe1QiYmXFtATW4hH/scQQogxSVJavUqe6+DHCaq9ht0PH7j1wSO/KZJsqG1r0LMi4g/vX0+1z9QCOgALj/2hRN9Ki9Zjd7PioRKEBrvfuyDdQn90/YJyF6p3BY1sqGW6fOlrdRYz+z8GFroJaNlDGvibOA7cf/+/qVZrmR2ttTz00EM4g3fuqAvGWLryaaodXyXe71miuU9QyL1nTKXcF0PL0xFJ33DcgfDP/4O7roQvvxvSCVs3yZbK2mCTGveY8finTyJ+pIfK95biHtA8IqeMCiGEqJGRuldIa4UTeyy9scqCn/bgeHDQO3N0zPX555W1zcrTrZpK3+YbeVC0VPssSjNg75/OxQF7zHC3aw2P2LqkE6I6H4VqL0w6EFYt2Pzg3m+EZ/+G7ltF8uivU44GRiKVyCF12MdQlR7UY7+Bpt0wp32fom1A1CgFvb29A8pqa9gCoH635yiUPKBxuKshRoE4NkybEOI6Ht/5vWJCM3zkTQpfVaiXvqEoMpiJSdSKEsEfVkDKIXnRvpQTSnZ3EUKIEUyCum3wPAdlFVZZwjDGxUUFLvm1EWHJ8Pevb27k3n5pN2d+fxyZNo2JYN7ZWa5917r+xx0PUi160Gauux2SIIrqc23SSJImj/7FidAwCd78W3joJ7D6IZhxHMw6Ba45HdU+D22rwODRmJ4wS+o1X8d/zUUYqyipxrrpXd8VrFVMmjSZ5557tr/M8zwSiQSVit20gbfGmNq5srXMk0KMZg4B08eHfOeDLlpZmptSrF9fXx12eRvhH95M4sBmrIKiZ2WUTgghRjgJ6l5GwiZ4/u8By+6pMPXIJNOOSrLqoYAbP76WuWdlqBYGN1pXPFDllK+24mcUWEs
iq6nmY9yk4thPNVHNG+a/NcvjfyzipRRHf6wRr9ESSRfoq+I4GlY/CCaG3uXwsxNg7jlwyAchPa72c1TFzDufQGUZkAVgE2uhFHmU+kedJKB7sTA0zJs3D2NiVq5cSS6X47DDDsMYhe/HxHHI8uWr6OvrY8aMGSSTGYLA4nkKx6mN6lmriCJkSqsY1YyxYMJBVxClqJvRriCKCV5YoLHrt1wUQgixgySo24pK3nDPFXkW31AC4MlbypxzzXj+dkkP1kC+M6Z9Xx+o7UU3bqZHpcfQNMXl4V/lWXpHhQn7eJx0WQvZNodKr+HRawvc8bVu5p2d5czvjaNxkkOcCIleuv5L7DBjDLTusbmgmocHvo+NKqjx+0K2A3PwB4j2eD3hDmQ2FJtZC5WKYe7c/Zk/fz+MgcbGLIVCAWOgUqnQ2dnJmjVrePbZZznssMNob+9gyZIlLFnyBNZa2traOOKIIymX66RlK3Yaz3Mwxo6JbIq+0mSsg9lQRTf7VBxL2crIlxBCiKEjQd1WhCXDEzeVBpTFoaXUXWuAPHdPhUPekyMsGWYel2bt4wG5iQ4t0zzu/mZtSmbnopAHf55n/3OyOAnNc/dUiAN46JcFHvplgXffMoFIUokNCWshTE5AHfxB9P3frRU2T8MeezFVEui930pF56hK/PyqWAtBUDsHlFKUSiXuu+8+Ojs7SaVSHHDAASQSCZYtW8aiRYsYN66NJ55Y3P/69evXs3TpUqZNm7ld+8mJ0SfpR6S9blT3zeBPwmQPpbfUMGpHb621pDtDSv/9MFQNKPDfO4PUYc2jMrBTCjLWwTUKFATaUhqFn1MIIUYaCepexksTmqxZGDDjmCTP3FnBxnDPlb0c9dFGfnv+uv5ki1MOSnDcl5q58RMb8VKKA89r4C+f66Khw+HoC5u45QtdADROclB1mjVwpMpHabJHfBH/iE9AVMZ4OfK2cfNIgLQrhpTnKRYsWEBnZycA5XKZe++9l+OPP55ly5ahlCLcwsZc69evY+rUGbu6umIEcBxFRi9DP3II2NpFU6fnktvzFnqK2WGu3c5he0Kq33mqFtABWAh+8iypQw6kPMKTHqeVQ8IobGyJXUVRx9sMvhuVR3DV05QXdIMG77SJNJwxibyVOZxCCLEzyZYGW5HIavZ/+8BGRli2HPPJJua/NUPLdJf935blH9/uHZA9f8UDVRonOZx0aTNv+lEb9/84T9+amFUPBbRMr8XQDe0Op13RiknIsNFQK8QpuuIWutQkeqKGMTG1a7goZfsDuhcYY6hWq7iuy7777ksikRj0ukmTJiGXnrEp6ZXRyz/XH9ABUFqIU316FG/rYjGdlYFFkYVgZI9MZnHQf1pD6b0PUH7fAuKvLyFnX74f2Hcd4n9sIF7QXSswEP5pNbqzOoqPrxBCjAwyUrcVfkYz/9w0045KsvzfVXY7OEFhveGXb+5kr5PTHHheA+Nn+1R6BwcNQckSVgyL/lyk84nNIxXpFod3/2UCjr8pNXTVQ/kxkZEeTFF/rIVx48axatWq/jKlFNlslqOPPppksrZv48EHH8IjjzxMGIZMmzaN3XabSrksw6ZjkSIGW4HWMyG1FxQfhe6bsaa4xf0jRwPlOziHthLfu3FzWVsCmxi5H1gphdsdUb5+87kdP5knumk1idPbqW4lE6ZjwCzsGVRulvThTBon2/YIIcROJEHdywh1QMNMxf77pFjwwxIP/LgAwKPXFoEib/pxK/PfmuW2i7r7X5Nq1jRNcVn85yIAp39rHA9dk8fxwXoRjtLc/8M8j/2+iOMrDrsgx8yTEoR68DQ1IUayKIKDDjqIQqFAb28vrutywAEH0NnZyfLly9ljjz0YN66N8eMncvLJHQAYoySgG8NCmyO5x49RnT+Cwv2QOwomfRyVmEmcH9kjV6+Uyrj4751BmHKJH+pGT82QeP8M8k48YqeEO44ifrY4qNwsyeOc0r7V10XakjyoBR7oGvh+c5pk1oTYJq3
VqF1bK8SuIEHdNsSxJYoMYXHghWbSfj5+2iGZg+O+0MyTfy2R63A49D9z3PTpjax+uBakLbm5xHl/aMfNWqwf8fzfIx75de1mGYeWuy7vZfIB4/E75GIm6osxlmy2gWOPPZYgCAjDkKVLl7Js2TKOOuoo0ukMUaRq50//bDv5jo9laa8btfQ/ofe2WkHPbdB2Luz21eGt2E7WS0jy3Cn4b51CrKFXG8wI3scxjg3O7NygcufgFqoe8JKVA57nkAyAANxDWjFP54nuWAeewn/L7kTNrozSiS2yvkslmWBpAGEmQ1ZZ3CDErUhHtxA7SoK67RBGEfufl2XRn0qEm1KxH/aBHNd/eAPF9YbJBybY/bAEjq8orIv7AzqAOITFNxaZe34CIsUzd1QGvf/z91XY602e3PRE3XAcjeta8vk8oPD9BKCYMmUK8+bNAxRhiGzeLgZwCDYHdC9Y/yvUlIvqag+3HWUtlG1M+YWlpCP8tLAWgrQi8dFZBD95FluKcY+fgD52PGG0+f6mtSLtebgbAqrffRrbG8IbJuG9bXe8t+wGCiqOpSJZnsUWmITHHTbB5asVn2mF3TxFQiueUz57JSxK0lULsUMkqNsO1gLpiPOvn8DTt5Xx0orcRJfi+tqdeeWCKisXVAE4/7oJg17vZxVgQcOkA32W3lEe8HjHPJ9YbnqiTjiOBgJuv/1OSqUSnudx5JFHksk0ks22Ui4bZERObInFQSkXXpwJ0UmDckdtQFevSjYmsV+O5BX71ZKcRJZ4US+Nk9JEGU2kLMm8Jbp5OXHKIfWfe1D+3jMEP3yWRKNHaW4DUTRyp5iK4VdN+Hx8ueK6yfDuNbA4gISCr7YppmV8khLUCbFDJAXddoptTOhXmHmGy9QTHJyEJdEwcKF7yzQXP6tJ5jb/WVPNmr1OThMEMWEUs+dJKaYeXssIqDTMPTtD0+6OTL0UdcN1Lffe+09Kpdo+jmEYcs899+A4yLoZ8bLKUQY78RMDyuyUiylHDcNUI/FyqnFMyTOYZ4uU3v8A1a8tofyhh+DGtaQqUP7IQ4R/WUNw3UpKX15E8n0zQEF85zoSFlxXmhhiy5SCNbHiHU3wP121gA6gauFj66A6WjMnCbETyUjdDgrDWrej68Op/9PKjZ/YSDVvyYzTvP4bLZAOOfe68Tx7VwWtYepRSWIv6J9uE+gqx13SiA0VSiusGxOo6jB+IiF2jFLQ0zMww10YhptGm6URJ7auVPFxx30Er+WNqMK/IXckgZpIsSS3opEqFWqq31s6YMpoeP1K/GPGDyizvSFmaR49qwHvDU2YZ5bgL15C5ohDqWYbqTj+rq+8GLGshQ4X9vTgt32DH38+gPmSOEWIHSJ30lcoMjGNezqcd914oiq4SYi9kCA04ERMO7G2s3gQVgbe+KytZbocvH2XEHXBmNpWBhs2bOgvSyQSOI5DGMoNWLy8vnIGrWejU/sQVwxW5l2OaFqB7X5J0goDNtrCcdMK/5RGSld/g8oPf7qpTJP707W4RxxJtKXXiDHLsYY9fYdj0vBs7+ZyF5jpWUxZvi9C7AjpVn8VYhsTuFVMpkrgVInN5ugtDOP+UT0hRpMogsMPP4LW1lYAGhpqGTBDWf4gtpMxliiKJaCrA1VtcY8dP6BMT0mjGlxwNk+RU60+zoEt6Onu5oAOwBiKH/0UqcIWhmPGMNmMHVLFCq3a8IVWOD0LCpjowvWTLAnJfinEDpOROiHEDjHGEoaKI444CtfVRJEhipSspxNiFCqbmKa37Y5uSxL9eyN6Rgb/zbtRdi3pqw4gvLMTlfNwDx9Hr47JVgc3xs2GjSgJ4AFwtcU1eQqrniHZNB6daiYgPdzVGhZxbGgpV6imElw53uHK8RZtDOlKlVg6xYXYYTJSJ4TYYcZYqlVLOp2mWrUS0Ilh5XkOsU0R2gzWpvA8Z7irNKr0mJDgpDacz+6FedsUenREOY7pScQEp7VTObqFbhsSxwabyeDsOWvA65P
vfQdBOjNMtR85HEdh+57nqRu/i1KWoG893U/di0952y8epUwU4+VL7OZDqrdAIl+SgE6IV0hG6oQQQtQt13VY25Pi7ZcqHn4aDttH8/PPalobKgTSOBwy1SimqhmwRYG1EATRgOcV0w003nYD5a9/i+jRx/DP/g+8s86kz0qg7cRFetY+g4lDbvv8iZg4ZOpRZzNu1oFAarirJ4SocxLUCSGEqFvlIMGpn1E8t6b2832L4OyLNTd9xUfrsTsCMlzi2NKbyuFf9CUS1QphOkNZEqQAoLAopVn0h2/2lz13129on3sMEw4/V9bhCyFeFZl+KYQQY4DrOps2jh9dqpHuD+he8PDTEMSj77PWC2uhajUlP034ooDOcTRJB3xvbB4bncyx8ekFg8pXPfhXlJXEIEKIV2dsXlmFEGKUc12N52l8rWk2Lql/95B5okST8kZV5j3ftbTkBpbt3g6uI6NDI0kmKpNe8C/iCz6E/u5VNAWFUfU93B7VwNAx/7hB5ZP2PxFP9jkSQrxKEtQJIcQo4jiaFuXRsCIgs6RMQ+xQ/faTBFctpfq1J6h+4TFydvTMvE8nAn7zJUvDpgSCLTn49edjGnzZY2NX0lqR1IaEtig1MFjzXIW96S/0nvgGqr/9PaUvfpne172ebKUwTLUdHsZY0h17su9Zn0a7HijF1CPfzOSJJ+BtjMZckCuEGFqj584uhBCCRuNSvnQR5plag1nlXNJf2pfifz8OxQizqoxZmseZlR72rKWO41AMfZRSJJwQzI4HYlEYst8MWPT/fEoVyCQh64aUIwnqdpWUjfCff57S/3wHlU7R+KkLKTaPJ9zUb5wo9FH8xhUDXhM/+RSsX4eaPI0xtduBn2Pv2Rew1yHvxSqLWhwQf3U14WtCvHOnUK1G234PIYTYAgnqhBBiFDFP9vUHdAC2LyK4dS3ekeMI/7q2VlaMUcM8KBDjc+8ij49eqdnQC+8+VfPpt3q4trTj7xWH+DrE3zRaV5UdNnYZ19Ukli2n66CjwdT+8JVf/Y6WRQvoybZsepZCJQdPL1SuO7YCOmqjdbrbIfrqMgBe+Pi6PYkda38MIcSQkumXQogteukUKlEfzMbqoDLbE6Cym/rwUg7unCaiaHgjn0LV48wvapZ3QqkCV/5e8ds7NK4rfY31JBVVKV3x3f6ADoBymeC6P/XvF1jJ5kh/5aIBr/OOPAzT3MJYE4YxziGt6ImbtzBQE5K4R48nCEZ/9stUyiPMpnAyiWHvWBJitJG7pxBiAC+hcBzIF/rIpDJYrQjLcvetF+7BrVR/+hy8KOugd/JEoke6cY+fgH/WFApODMMY07mu5r6HGDRKc93dijcd4+ArmYJWLxwbo9LpQeUq19D/7ygyxAcdRPPC+6n+8U84e8/GPexQ+hLZwV+CMaBPRTR8ZQ6sKYMBNTFFnxMP2ANwNAobM9xQUvyqSzHLt3y6xaOpXB4TwawQu4IEdUKIfomESzno447b7iCOazfavfaZzR4zZ0pgVyeqniV9+XyCXy7DlmP80ybB9DTx9CRWQU9ssGZ4G9JxbNl7au3fZxwFH3lTrW3flLX4LmBcqibB2i5oa4KEE+IgKd9HosBqUu9/F5VfXYvN5wHQkyeROOkEii/ad63sJKh07Ib3kY8QGUsxMmMyoIPaFMxeQvREj5Ry8CNLQ0lhEy4Fxwz7WtedwUsn+HGf4rMbaveRv5UUNxfhH1NSJIOxlTBHiJ1FgjohRD+jAx68/8H+gA5gyaInmDVz1jDWSuyIookJxmnSH5oJsaWchGp1ZCUNsdbS0RzznQ8rJo5TnP4ZyJegIa24/iserY0uR/yXplgGreHbH/I562jQspfXiFN2EvjtE2i++68Et9yGymTwTz2ZUqZh0MiTtVZGZV4kZTTqtk5Kv10OBnRHktwlc+hx7KhbX9fjuny/d2DH4PMhrI1hmvQXCjEkZE2dEKKfQlEoDO41DcORFRSIlxeGhl4T0quiEZtNz1M
VTj/S8l9X1AI6qP3/bZcoOrtrAR3Ulmp9/CpFNfKGr7Jiq6y19PlZwvHt+O86D+fsM8k3tlKJpaW+LV7VEv56ef9UaLOmQvDzZSTV6GuaKaBpCx8ro8fsgK0QQ270XTmEEK+YVYrJu08eUJZMJvF8aVCLoRfHsK57YNnaLmpTMF8kjKAsg3QjljGWopeiRyXp00miWFrp26K1wq6t4OzZQPJ9M0hesAfOfs2Y54o4I7Mf5lVprgZ8Y7zFeVHZKRlLDvmuCDFUZPqlEKJfVIa5c+biOA6rV6wml8ux/4H7YyLNsGbWEKOS5xhm76554vnNZftOh2poqfXt10xug0xSGn9i9DDG4uyeQZ0xmep1KyC0+Cd3oDqSVDxGXdKUajVk34zDE9NcbivBnj7s6UEqX5Q7ixBDRII6IUQ/a6FSsuw1ex9m7bUnCo0NHcJhTn8vRg/P0aQjjQotJCOuv0zzjss0/1oMh+0DP/2soSEN550If/m3Yt9pcNXHDGm3QjzKGrpi7FKqtl9k+RtP9G9WV7l6KelL5xCO0tErt1ihWcG5CY8oiIlKRgI6IYaQBHVCiAGshaBkoX+ijNx2xdDwHE1mfUzl0oXYnhDV5LH7xXP43cUJYqtRGJJOFWssX3+fz8XvcnAdQ0IHxDKlT4winucQ3beel8ZvwV/X4r13KvEruO4qNfLXp1kLlYqs0RZiZ5A1dUIIIXaJdKypXLYY21Nr1NmekMoli2i1EQlVwlcVjLFYC8oGpJwyHlXMMG/BIMRQM8aiJqUGlespKYzase+76yiaHI/mqkOLcWm2Lo4jzTshxhoZqRNCiO2QdBycGCIXqpHMA9xRrqtxKmC7BmY8sRuqKGMZkEFBiFEuigx6dg49M4t5upZxWLUncY9vJ78D2YaVUjQYF9ZXKV+9FPNcEb17msZPzaYvqyRpjRBjiAR1Qgixie87ZAMNsQWtCBOKYhSSsy7R9asxS/rwD2wm9boJ9NpwxE91Gik8R5PpNpiuCqo9iV1b6X9MdyQxWtLfi7GnT0VkPzsb3RthYwMtPn062qEkKUmtYVWFytVLMStqe4OY50uULllE+rI59DEKU2kKIbZIgjohhAAcx6Ghoil9bTFmaQEyLskL9qBx7xzVrz1B/GQegPiJPtyVZVLn7UZptKWo20nSsaZy0WOoVp/UR2ZRueppzMoyekqaxGdmU3DjUZftT4htMcbSR4RqAnCwdscCOgAdAb7uD+heYNdW0JGVVp4QY4ic7kIIATQYTeVHz9QCOoBiROVbT5L9wUHES/MDnhvdtY7UubtT2sL7iMFUYLDdAbY7oPKDZ0i8eXdUq4+akKTXjes+CUoq5aG1plIJiWNJLCR2TG3E/5WdA6EHvqdRbQns+mp/uWrysI6MgAsxlshKWiGEAHRkiZf0DSyMLWZDFSanBxSrjCtTL3eEryFdWzRnnitS/uYSKlcvJcTWdRDkuppxuT4yfd8nteaTNPtLaEwXh7taYgwJopi4xSN14Z6oRg8AlXNJfmY2Jbd+zy0hxI6TkTohhACMp3D2zhHdu3FzoaPQ4xJ4h7QSPr95XM5/93QqrpUpg9uppA2pT+5F5fIlUIpRzT6JT+xJwTPU85KfxlQ3atEJUH6iVrD2f/Fm30Ai8Tqq1Tr+YKKu9IUB6SkJ0lfsB4HB+JqiExMOc4eJ57mUUj4ximwUEpWCbb9ICPGKSVAnhBBAXsU0vXsG5Y0B8ZN5VINL8oKZhI6FkyaQOmIc8XNFnFkNBElFVXbC3m6BMTAtSfrK/SE0WF9TdAxRHW9qr7VGBc9vDug2USsuIT1zHtVq0zDVTIxFpTCqTQf3AUbAGtWUz3LX5+INiu4Y3t/kc1Sjh9crI9lC7CwS1AkhBBDHhnxCkf30bIgtSiuqCSgEtfTiKqdw9m8gjo1MvXwFAmMIlNnU6DTD3+gcCnZLQal8OYQo+T6HPacobTod7iwp/jARXpfyCMqy+bgQO4OsqRNCiE2CIKa
LkC4nYqMK+wM6AGstUSQBnagxxmCT0yE5c0C5nfw5ynHbMNVKiOHn+w5/K9Ef0L3g+72KvOsNT6WEGANkpE4IIYTYQXEm4Bfd/+Z1e15LU/dfSFefhwnvJnamUCnKSIQYu+LY0O4PLh/vWBJbHN0WQgwFCeqEEKOe1grXraX3jmPqOuOiGH6uq1lQfpzPLb8cB4ejGw+i1Z3IBbqV1lIWmYIpxrI4tszzLfslFA9v2mUhp+GLraALAXL1FWLnkKBOCDGqua4GQhYufIwoipg9e2+SyQxhKA1v8coYJ+aPG/4KQEzMnb3/AmBaYjLvbHgLQTAaFgwK8col8yVumJRmSajYGMOhSWioVIiMhHRC7CwS1AkhRjXXtdx441+IN2WrXLVqFSeccAKum677Ta/F8HCMw/zM3tzUfeeA8jnpveQ7JQRgjMXvLTJfK5RSxL2mnncvEaIuSKIUIcSo5XkOK1eu7A/oXrBkyRIcRw1TrUYvrRU5XJpCh2bjklbOgMddR9GIS1Pg0GRdki95vF6EoeGNLSfxhpbjmZaYDMCJTUezd3KWTO0V4kWMsXJOCLGLyEidEGLU8jyHjo52jjjiCBYuXEg+nwcgmUwCEtQNtUbrUvnKYswzBdDgnTqR7BmTKBChtaKhrCn/92PYzgq4isT7ZqAOaqJs62u6oudomvNZrlj7X5B2YI8UfckqquTJajohhBDDQoI6IcSolEpp1q9fw9KlS0mn0xx77LEsWrSIFStWsNdes2Xd0xBLOg7hH1fVAjoAA+GfV5M6ZjyqWZMymur3n6kFdACRpfq9paT3P4hyHc0ZUQqyVU3pYw9BoTahTE9J0/TlfemxkvVSCCHE8JCgTggx6iSTLmvXrq54pusAACAASURBVOJf//pXf9mqVas45ZRTmDNnLmEItk5TazuOJpvxMRby+cpwV6efjsE8lR9UHj9fwhnXgB8Zys8VBj5owPSGqBZVN/v/JRyH4Hcr+wM6ALOihFnSh7tPhiiqz++VEK+WUgrPczDGyHkgxDCoo/5RIYTYXhFPPfXUgJJqtUpPTw9hSN02OJoyRZoTz+GtvJDEuksZ19BFKqlpSJVoyWyE6goyyeEJ9CIXnENaB5U7e+XwdIC2j+Lslx34YEKjm/26CegAlAW6g0HltidEKZnSK8amOOmzOpPm69UkN6oUYS6N1nI+CLErSVAnhBh1lFL4/uDdb33fx5g6iiBexPcd3Hgl6pH5qLVXoVZ8GbXwUDJ+D4nn34Xz8DRYMJVU5xdpTBd3ef2CKEa/pg33pHZwFarJI/GJvagmIekV0c+fTfKtPu7BTaBBtSdJX7wPRV1f02ADLO7pkwYWegr3wBbCsL4+ixBDwfMcFuLxYFXz+qxihq+5rqSppJPDXTUximmtSCRcPK8+E27tDDL9Uggx6kSRZv78+dx6662YTfsitbW1kU6nKZXqc5QukyjBsq8CLwocMvNhw7Wonps3FRhU5/dxWs/GcQ7e5en1e01I6i2TSZ29GxZL2bEEcUxKKUzjKej8paTeegj2Xa+BaA1mfEBQqK/bUBwbwnaf5Jf3JfzjKkhp/LfuTtEzyK7KYiwqeB4tWnNxF9xbBgf4QLOiknZIKOpqJF7Uhyid4BlcftanmOfDf+QsyUKpbjtth0p93U2FEGIbXFfjReAnMpx++umsWbOGdDpDU1MjlUq9X/BfMrkiNQuVv2fQs1T+nziNhw7aymFXKJuY8guBZwwRCe5/pp0f3vh9Zk8u8vbXbqRjzfHY1F6E0Y93ef2GQsnGuFMT+B+egVXQh+nvPBBirHEdxW96awEd1LqdvtsN5+agSSmsRHViCLmew19Dj3PXbp7e+8M+xc3tSfxCeRhrNvwkqBNCjBoNykU9nie6vRM6kiT/YzITJ04hCOK6HaF7QbGaJjf5s9B1PdhNSTqqy7Gtb0RtvG7Ac23jCSNi3aDnOdyz0OWsL74QjDbwo5sbuOtbdzOuEYrFNNTpJgBRtGk
z5fqsvhBDx1gWVAcXP1qBmVqN+dETsW1xKkHVcwktJI0hVa5s9XuT930uXTtwvebCKmy0mo5dUdkRTII6IcQAWmsSCYiiCLC4rkcQMOI3kPU9B3VfN9X/W9pfFv9rI8nL51MaBfPigiAmTE/Gm78QOn8MXiu0vR1jE+j2D6M6vwc6gZ38JUJnKiYY/oZUoepx2S8Gji4uXQXLNk7AdUvSgy/EKOAHEadlXO4qDWxoH5GyxIX6v/aKnStMJ/mfgst3uhUG2D+h+dPENIm+La8NV9gtJgSRPFUS1AkhXiKZVPzjH3ezYcMGAFpbWzn66NdQKg1zxbYhUYXwz6sGlNmNAXZDFdXmjIoAoreUwnEmk+74KtZaCn217vF023+T6PgkrutQqKSolEfGwnEFuFuoiiTFE2L0iMOIcxoMi6uan/cpchoub7M0R5GspxMvSynFGuVwRffmm8JDVfh2j+IzKRdbjQa9JhuEXNTq8OY1m0O7A5LQMgo6b18tyX4phOiXTHqsXr2qP6AD2LhxIytWLCeV8oaxZttmFajM4H4qlXQYTXPk4ri2P12hsHm+U6nq0V1sBL+DSnVkBHQAaT/g4neZAT2oc6bDpHF2VATZQogaN1/islzAs1MNj+wWc7qu4Ja3MCdTiBdxHMVjW/ia/LuiqOot38uiMOYoJ+L+3QwXNll+PMHw546YZGnk7Ns6XGSkTggB1BKM+L6mp6dn0GPd3d3svvv0YajV9qt4luw7p1H+3ML+LIR6XiMm62Dt4N4+sfNFkWHutJCHf+zx69sVs6ZYjj/IktRlJK+IEKOLLgckqe3hKF02YnvEseGw7ODyM7KWVBSztVRfbrnKdB1wUdrBGEvYF8s4HRLUCSEA11VUKnmeeOJZ9thjD5588skBj0+bNo1qNRym2m2fKDJUJnikrj6Q+NEedHsSOyVFnkhaGMPIIaCjMeTTb6lNgQ3D+GUDOq0hkfAwxlLdwtQbMfJ4riYZ1oZjq15tz0IhhNgWayEXhvyq3eej6xU9Bt6Rs5yTMcT5l7/+yz1iMAnqhBBobbjjjr9hrWX69OkceuihPP7441hr2WeffchmGyiXR34/WMUaKh64hzZijMUYueCPBNZagmDrx8J1NcZYMskSCdUFG38Piak0NB5LT6lpRGTyFFuWVg7uoiLhNcuwoSV55mT8A5sobLWPXQghNnMrASd6hgWTPZRS+GGEygfDXa26tFOCuhtuuIGrr76aKIo4//zzedvb3rYzfo0QYgi4rmbt2lX9a5xuv/125syZwzHHHIPruljrUC7XV3AkQUB9MHgUAp+774dZUxQzJ4aMf/oQMH21J2Tm07jXDWzMN+6S+ngeZLw+tIqJjEeh2jCk6dgdR5H2C7g6JopdikGmrtO9a63we2LKX3+iv6z6f0tJfHkOemoKE43s0X0xtmmtaDAOqhCDo4hTmoKS5C7DIQ4jkmF9tTNGoiEP6jo7O7niiiu47rrr8H2ft7zlLRxyyCHsscceQ/2rhBBDwBhLLpcbUPbYY48RBAGzZu39siMsI0kqDkgEZSiVsNkGCukG4ljuziOV6zo89GyCkz+peGGP9DceneXqd/6EcWvOqhUUH0EFz6P1/P7NvTPJMgmnBNFGrNdOodJAGL36dJoJLyZrF6AXnQvhWpzsQbgzf0dPuXVIAi/H0TQmVuE88w7I34+bOxJvxk/prbTV7ffUdR2ie9cOKo//1sna/5hJe7PBxDJiJ0amRutS+dJjmJW1Dav1vEZyH51FL/VxzxPipYY8++W9997LoYceSlNTE+l0mhNPPJFbbrllqH+NEGKIGGPJZhuYOHFif1kmk2GvvWYThvUx4pU2VdQ119A1fR+6Zu9P70FH0rBuNVpy549YxcDjE1dtDugArrvbo0cdDupFmVbjEnrTnSqTrJDsuhrn4Wk4jx2I++g+5NxlOM6ru5X5DmT9PHrJGyDcFKQUHkA/937S/tDs5ZFN9OAsORXy9wEx9N2
F8/SbyPj5IXn/4WCMQU/NDCovdWT52q80pTCxS+qhlEInPVTSk3NebJeE5xDe2tkf0AGYR3uxzxRf9fVEDC3H0SQSLq4rx2VbhvwvtG7dOtra2vp/Hj9+PJ2dnUP9a4QQQ6haNRx44MGceuqpnHTSyRx33AmEoaqbaSiJcpnCJz/PCxGCWdtJ/oILyQTlbbxSDBdrFV19g8vLVQVq0yQSfxKkZ/dPp004RfTKizc/Oe5FP/efpL1XHhg1RGWcn/0cu/45MAMDONV3D64zNGs7HKpQeXpgYeFBXF2/UxSjyKBm59B7bx7p17un6Zk/nmvvVMRm5wdYynPoyqb5VD7BBX0Jnk2liRP+Tv+9W6K1kqCyTmirsMuL+G+eQurze+Ofuzu4YJ8vyjEcQcJMige8FJ8rJblLpwizqeGu0og25NMvjTGoF21KZK0d8PO2tLZuIbfpMGlraxjuKoiXkGMy8oyEYxI+9zQvjUDjRU/gRRFtHcNfv11pJByP7RGE8IEz4PM/3Fw2tR3aWtLAqZDYHSZ+CJWYTH8/YeEZBqUyrSwlmbAkG3b8c1tjqPzkOoof+zSp198OOgnmRXsdNRyC56Vf9d+0ra0BggK4zRB1b37An4Tj+nVzzLbGfHJv+tZFRIHluYLH2y/2mbcHZFKatqad+9mWh7Dfs1Da9LX4fV6xYGqC/XMvP0o41H/zcncnvSufJA4qNE+bS7qlfUjffyzY0jGxoQENaieMnpn3zCC8Zz3hLWvQezeS/d7BWBOTbEoP+e+qV8N5bSoauHQDfK2r9vNVPYp3NGquGN9A08jZjnVEGfKgrr29nQULFvT/vH79esaPH7/dr9+4sTAiFo63tTWwfn39TosZjeSYjDwj5Zi0tLejsllsodBf5p9yElEiRddOqp9SkEhoyuUilUqVlpZmwhCiaPiuXyPleGyv809KMaHF4ZpbFXtPtXziLZZ00qO62w+IrUexzwCbP09ztg33JYGRbTmLYiVJuXf7P7fjaDwXvDCg/PNrACh+40dkP/tD9IYP194/tTfx9B/Q0+djzCv/m75wTHwvTcOeN6J7bqpN8ez5K2bmr+krZgl3oO4jkeMoik1pPvq/irsXKg7f13LVhRYdl1i//tWfD1orKg0Z7qnA6hBOzECziUgFIX+KU5TsizqSgW9utPxvroqpbHkUdKjPkyR5bv/CcfQuryWMSbV0cMoV/6aimoDatUIpNSLaNiPVS49JWjn460PCG1ejWny8UyeS92KiIVp/2qg9op8tI/r7ulrBg92Yxb34/zWzrq6hO9Nw308qDWmu6B4Yvf2sFy5qMYRdRZSq9eVqrahk09xYVDwbwbkNlnFRiFOp3wyaWqtXNMg15EHd4YcfzpVXXklXVxepVIpbb72VSy65ZKh/jRBC9IvTOZpu+BP5D3+YaOmzJN5wGtmLvkje9yHcOYkaEgnNfff9k3Xrao0Cz/M48cST0NoZlY03pWqJMYyxxPHQrLV0bZkzDnM48UAX3zVgQqpVqFYd2MJWsoVKjoZ97sZ57gNQfhrbcibxpC9SKW7/rawhVcYPF6PX/gSbno/3q2/SdcTZlH/8G0xfkcxnfoe71xRCN0uh0tifoOWVyCQrUO2jJWuwuFDug/Jj2MRUmHMPxaCVsFr/35U4tuQSJb73MZ/IOLhOjK+DITsPyg1pLt0I5zYqDPB8BNpzyXmGnLXAwNlAjRrULvqzuq5m7YN/6w/oAMpda3jq5u+xxxmfI2nzuJV10LsSJsyhqHKoTdP7wshBKcnW+1Kuq/GWVSh/6fH+sujOdWS/vR89DM10ZSeGyt3rBpRFD3aTMHYnLEwSr4za6hazlVyGtTGMd2qBzHGrNEs2xXCXdyn+NtljvhuNuXNryIO6CRMmcOGFF3LeeecRhiFnnXUWc+fOHepfI4QQ/fqAxvnzyV37B5Q10JClnEoS7qSATmtFqVTsD+gAwjBk4cJH2W+/A9HVWllVD10ANJxSSpOogHm
yD9Xmw8QUfQxN6u8oivFUjN2OQxVG0GunkZ76OxwdUY0zlAvbfxvzPYVfuBn9zDsBUPwCJ70vTb/9Nl2HnUn1dzdgq4bU/32HvJ9hS4Hl9mrKFHFXfQ7W/QwHsOPfgWo5FbpuqIUgG64lNedBKtXcNt6pPhhjcVUV13nh56F5X6Wg1yjOb1KcthI2bvqeHJNW/Lzd53WuZTcXlm9KWNig4WPNFlPYNWsVtVaUNqwYVF5c9zwplce7/VOoR39RK/RSNLz7Lgivh3At4cRP8YDZjSnpFOlyBbuTrlf1xo8V4e9XDiizvSFmaQFnVmrorqm+hsqL3stRvLSDQAyfRBjywSafb3VvPibnNFgqKPZfpijZ2tH61njLiRn6gzoLfGmj5rfjPLyoOix1Hy47ZZ+60047jdNOO21nvLUQQgxiraUnDHAnNKO1IgwN1uy8BpJSikplcBKWtlwryTUR8QNduPObSbb4xK4mr+O6Hb3zXE2iM0IrhQ0tqmRQqytkJybJD0Pq7zg25OPBGRe3R8rtRa+6bGBh6XHcue003ncnOpslbm6hkEgPWrq3I1xX4xT/iVr30/4yte4nkDscMvtB8WEIO1GVJ9H64Lr9buwKSil8Bd/t2hzQAfy9BM+EsF9U5d4pCe4qQ8EoTspYMqUKu+pPGoYxux95Fo/84ksDrjl7nvYh3Grv5oAOICyjbr4Qjn8jrLsMf/0vmTH3CfZbPp07JicZH5fku/CChEY1eeiOFGZVCdsXoRK6fz/VVyvwIPGm3aj+Yll/mX/aRCKXV9OXI7bB81zC7dyPTlUCPp5z+P/s3XeAnFW5+PHvOW+bsjtbstn0HhIIEBJKFIwCAgIi2C4q9u4VC17FLuBPAb0WxN6Va++9oYiChSZiSAIBQkghhJTd7E57563n98cku9lsaMlmd2bzfP6BPTOzObMz8855Tnmek7OaX1YUz8galucU521WA2doDfCubYobZsGn9ziunFBPxnWoOShBnRBCjIXR2mqRJCkTJnRh2zZxXP+C8jyPeR0ziX/+EPaSDvyP3oXpi7COaqPt7QvpU/GIDUhGUzbVqNhQ+eBKCOp/X2txO5mLDmu6SW3DHpk195Aqm/L8eYOvzwG+TJalUTv2UcqndCtkD68HdQC69XGtUB7K0tTgKdi8j3HgxhiOVeAWK5xta5RWRMVkVMfkxgCZCZz9iX9wx7c/QBoFHPXC95GdOA9TWTv8AaWHwOra9eCQwo5reHL2Q1zaq/hcwUE38TmgkRLolPStC9ngG26LFSfZhsn39pOdkSNNR2YFthxHtJ/aTW5JO8md/ViLCjDRY+cI/X4xVNSWp4bi3hAW5sAzBqdYeczHedUaT864LGvX5KKIYupyZ2DRbUEpBd9ABGSBX02HgobEQF6l5ILwkKs4KEGdEELshyiCM888ixUr/kOtVmPJkqWkv9+Oc9YUqu9dAbsSpiSr+gmveYDsq2dRbcIRvE4h/MmmgYAOILmzD9MXoTpHbuZ8NPhRG86My9H3Pn+gzbQ8mUR1jujzSJIU03EuatvXht5QOAk2f3LXv7uM1JmOCZvn73cw1ZNkq32+Dl1xxGvbHG6oDs4ieAqemoWgvz5sG8uzM5FxsScdxZP+57soDKndQhCn5NpmQG4CVHsG73zMC6Hyx4EfY10gMLAzVSRKyXEuIMxm+FzJ4or+wdf7qqO6eFk2hsrIBV19RFhdFu5ZE6lGCXEsAd1B0Zrj2qriNQ/Xz8hp4BtT4JzWHKb0yHVAjWuzyfW4ZIeimCre2aE5Vhlum224P1JMtODOAH5bAksZ3vSwYlNcP1P7vSmKSGvUIbbsKkGdEELshzhO0dpm6dITAIPWNkyKMX3hQEC3W7KyHzemKQ/gp0DaN3ywY0oxaoLbNLUMAaI4peY9lcziO6Dnh5A9irTwdIqVVg54eW4PcZwS55+EPfmt6K1fBMBMfhOm8HSIy+DOJskdS391ZP/dZqS1opZk2NKjCUKYPdm
QtWtDzk1VKgFPb7P50iT4Up9iggUfnWjIR42zqpUkKQm7amjtCjBLpo3Ca29C/+ndqJ33Y455Keqwo+D+8+r3cyZSnHABf34QfjAlJRtF+1xZsG1NIbagHNdXx1ts+lRyQEl8GlloWXysd+g2gEt7FM+fbTPSVcqSJMX3x+ffsVFULc3btg0mPUmBt21VnDJb82jFIyoZjyet1+ye97rR1/xphuGS7Yqbd1WeeWXB8KXJ8OzN9YAOoD+FF25R3DXTIxM+9lqdZQ2UuG16EtQJIcR+SlNDOLDSEpE9YQKqL6oHb3uME/TCVmKLphy/12xD5uwpBJ/bo3B23kbNGrmtUKOpEmTw9XzsjktJ05S4nHIwXpj+So7cxEvJT38XcWII4izVkouTe1U9g2hl///dFiycREFqSBxFuYnPbAZpluddovnXmvogfuYk+Nvnsnhq6NYsp7/CBRmHM6c4OEC+Fhy0REgjJY4NfXoK2bO/gk4DEqdAxulDzfkcqbLpazubj/ZN5kdTUpZZMXGw7+CikNr4V6wmvb9essVa0Er7+xbRS+MEtSPFsjQpDMtxWU2b8vIpqG+HLO711u5LH/3oouNY/K6s2Hsjw+d2KpbnGAjqrikqPjjRsGKvfCjltF67MvMo/0ZYyBMpxZYYpjvgpSmq+Mgrh82gCeeNhRCiMRWtBNPhkLloIWTraQD17Dzea+dSU805GxzGCea4dry3LcA6ooD91InkPrmEsm7sAfWjqQfjBz/ddTVwwZvGzkpb/f+pJ9Y4kOx9BWVjvrWR6utuo/qGfxFdeTcF05zzs7atuWm1GgjoADZuhS/9UmE7w59TUIvIl6q4pWrDB3S7pamhEnuU0gLVAHrL7RSzL6OSvYA07uT9rTFPS3zs6r6z9GWzDvHftw8EdADJvSWSf/WSyTTn6/5IkozLajfLvwPFmfmho/n/ajU443RlcryzgRP3WmJdnoVHqx+epoZp9vAwvtuCvr0++jsTxfK9fv8EC/KPUtckbsvzs4pi3jrFSRsVC9cp7oo1hcJIrwWPrvF1RRBCiDGUJCk7SckuLZD9/LEQGxJbUbQS0hEqmjsWSibGObYV9+gCqQX9pE39fJpFa4uNraoESQvVaojWCrW5NlgwGUjXlon/uBXv7IkEcXMEOrtprVj30PD2+x5UpGmTZeF5AnYHpIoEGx41mYOlNemG4asHyfoK1lMnPMajm8s67fC0TZouC342TbHUg9tqcHre8PICeMXqIXZCanxo82v8YGqG92+Hf/iK5VnD5ROhrVp7lLVmwwlZONqDlbvmOzo1vLXTsHzD4LVhd2KUj3XD/2yFW2owz4HvT0nJ+MEjru6GSnHx9sETd76B1z6s+PMM61G3hDY6CeqEEGKE+XGMD4NX2OYaa+9TFKdECkn3PQps26I9txO2fh1VuQNrwnPJtZ1Ov99Oum54xjhzXwl95sQx6On+0VpRyBbR8Q6es/wI3vdVPaSu3evPTVHj4UMzAoIwouWUbqI/PTyk3Tm5m1LQfNufH4mfwif76pvHdiRw2kZ4ditcMsFwrIqp9dXk0tOkwjCmNS3zia48FaAFg1Ou8EjH3ZRjsdnL8JHtms9OglpqqKQw1VGUEsXr2+FnJZjlwIe74NLtYCv41lQIU6gawxQNRj3yxFAtZdjWzgci0E0+lyRBnRBCCNFA2rI7UGvOh/KtAKjeX2CmvZf8pIthSfuw+1vLuwgtmibgbs8Vse4+Hfy7mdL5Dq7/5Ad479fa8UN454tSFs2KR67AdJOLohQz1SPzlsPqWWi1wn3hTNIuZ6CcynhgARO1YXedlAj4SQnOzxtiMzJnB7WGnOsCBv8At0GLJyaOQfdXaN3986Pct5bNcOJ6TcXA90vw7SmKX5TrgZyj4NtT4GUFuC803OgrLptYD2aetgG2JgCKjFLcMcujK072mVE3q+pB4YY95kXOaTFYTb4BRYI6IYQQooEoEwwEdANtD38ee/J/U23pwnv
7QsJrHsD4Cc45U2FJO1HUHKs2rmuhdnwX/LsBaOn9JE9t+Su/uOxHBGomOSckbrJtpAdbn4nJndhB5ph2UBBkFNVxtEoH4Gq4uMPw/ZIaSKpxmANPyRrivbNs7IecZZGpQfijjRCltJ03jbDVppyOn8B4PNBacVcIlT2CKwPcUav/92iv3rZ4PeyeADg3D+e27A7o6moGPrFT8bG8TbKPz8rEJOba6TZv3gp3Booz8oaPToRCUKnvsmlSEtQJIcQY62wpoVX94IDBoa/a1pApll3Xpr+WoVSFXAZcO8WlubOFNaZ9pBDQ9ZMeVZPgLmkl8/FjUAoCy1BLmmeAr5VB+6uGNpZvp9P/KqWOSwnDBnzjN4BqLaK6O7XdvnOqNL12v8aqWRmur9bPSi3zDJmyPyIL0JlIUbno9vq+OyC6fhv5q5eiW/W4LQ3RjIwxTNrr8vfbMrymzfD+HYrntcAX+4be3p9CYAZXeXcLDJhH2E5ZLteYXXD5xmSXGENOGbJ+Bb/JP1uS/VIIIcZQV6GI3vRe1O3zULfPRT3wZjrypbHu1jBaa3oqGU55q2LxKxXzX6T43+9qAtPMx8obk1EuZsL5Q9tmfojAdAD1jKRFFdNPTK0Ro/9HEcWGtPs1w9rNhAuaJqOlODhMnJApVjgPn5MTH7dUHZFSHbmcQ3Tj9oGADoDEEP5qMzlX1jYaiTEwkYQXtAy+7n+swAsLiq9NNky2wd0rULvJh7NaFG17RDQWcHEn6EdJ6FUshmT6y7T0l9F9FYImD+hAgjohhBgzmYwN5X+htn+L3VWYVO8vYeevaWnxxrZze4nxeP9XYMMe+Ro+/3PFzrJGyzfJiOoptcGcqzBH/BYz4/9hjrmdtO1sSuXHfmyjSxJD5ByBOezbkD0CcotJF/6KwEzZ59kXcWC0VsQ5j1prnlpLFst+tETyjSGO0xE972YMKHcfFynX2ntxRzQAp1LjU50xK2YbfjAVfj0dXvcwfLdYz5z54S4zZC/DHBd6E7h2BlzUAa9rgz/PhG/1K+5xPFQTvOdHikxRCCHEGHEcG3bcOKxdFW/E7njZGPRoKMtS+LGHMZo4Vty9YfgIaO2DMKPTwpgEGZOPnB3FNjzvVJyJp+P70bhJ6tCS8XEq16OKf4OZH8RkDidUs6hUZWZgpGQyDr0VBwW0tCtesUXxp6pipq355mSLo3QN9UipB8ch34/IndRF+JNNmL5dW5UzGve8afQF46+A+3iQqYWssyzevl3x0K636v9OhB8UFZtjuGEm/LliKFiK4zPwoodge1I/W3fJBDh1Y/2M3beLmttneGRKh8YxAbmKCiHEGAmCCDrOHtZuOs4hjMf28mxZmh3lHBd8yGHBSy3+c5/ivKcMjdpOOw6OPxz+tTbDgzvzRCbLo2SRFk9QFCXEUYBtm3Hxd7UshZfci77vAtj6FbjnhagVx+Ck69HNnku8QYTk+P1tHs//gMV/XWrxj1sVFzr1v+3GGM7ZrAgyjbULYDSUnYT8VUvxXjcX7xWzyX/uOPysQY7TNaY0NUxzGAjo2jUcn4HLeuAr/fDMB+FZrYpry/D8zbAthnIKvy/Dg/Fg0pTtCdQOoeVYWakTQogxEoYJpvUImH4J6qFPgkkwk18PhVOoFiO0VmScCEVCkGSJ49EbgVQjj2e9R3P/5vrPr/iI4s5vQm/J8L3rFMceBp96i+GYVyn6y/UvzWedpPji/2RxVDPnD2sMOS8gY9ahH/4Mxp6EmfJWiv4E4iYu+u7YGrX168Pa1Y7vY3deRngIrR4dDLatufdBzQX/b3AQ+5z3KW79JoCIyAAAIABJREFUKhztwsqwnhVwawKzxrCfYyGIUgJSsk/vAqDoR+Opbvu4Y4xhKinPb1H8tKyY7cCqPc68FVP4aRE+Pbne3qFhfQQO8NU+WOBCJa0nS8k8Ygny8UeCOiGEGEM9pQLt3RdhT3otKEhMhp3FLK6d0upuQW36ICrZSWbKO6hlllCpZUe
lX7VoMKADKFfh+NfByv8zvOsC0Npw0WcU/Xuc8/rNPxWXvlIza4KS81EHwLI02fQu9OqTgV3Hfrb/H4XFK+gtF8a0bwciNWCyRw6fN88uGpGEGIc613X45u+Gt3/vT/CKF8HF2+sJJLoPnSNGw/h+82SKPdS5FZ/PdGb40ASLcgottmL3IcjJFpzTAk/ZUC9WD/D8FsNnumGBp1gdQIcFiz1DvhpwqKRgkqBOCCHGWF/ZBdwhbS2ZPvR/joO0fhZA9/2RzKLrqFlPHpXzVZ5jKOShWNmjT9l6MeSs5RMkGTZtc4Y97qEdioltObRKyTnBuDkLNpoyjo/e+JGhjXEPlP6JZT+zaf+mYZhA10tg29fBX1NvzC8lbTuDuHyoDLsOppT504e3zp8Gd0f1gstf7Da446zG3UjIZh0yVYMyEOY1ZV/O2jUCp1JjItCtFFHG5QvdDu/eoXh5G3xqp2FHMjhFdIOvKBvDWZtg565L5AkZ+PlkDzeSM3VCCCHGgG1rVP/1AwHdbnrLp/Ds2qj0IWOHfPO9KZldsWZrDr79gZS8Ux/s5L2IVz9z6OpKPguTOhTzLtA842KbPj8rZ6X2gzEao4eXilD7aGsmOWUR/KRGkvkN6cybSWfdStL+cyK/fay7Ni5UqxEvfQYsmDHYdvRcOO8phre2p9wzK+VsXcOS5CBDtGmHzFqf2kfX4H9oNfrvvXSq4RNWYuwYY7D9gPPtGqtnJlzYZtgQDf1ueWErfLRnMKADuK2mWB0pLOvQ+B6SlTohhGgwxgB21/B2pxuzr8LUB0Mac9IRsOY7HmW/HtR5Vo1kV120OE549vKEJLX46q8VU7sM73qx4sP/B2kK92yEC69SfPM9LvZ4rZZ8kNQil+z0y1A7fwNm1wA8M480v5Sk3JyrdABuqvB/8iDRDw3kLUiB2gayX+uW1PIjJG9X+dNVOTZuBcuCaV2Qs6u4peZ93xxMlgVWJaVy2Up2VzkPPn8f+gNHkj0yL9s1G4wJYjJBjO1YvKKQ5bba4IVjogX/jIZfSB6KQSkFh8DZOgnqhBCiwSRJStpyPDp3JFRX1xutVtJp76NWtRmtLydlYjI6JpPf1bDXuNA2Pi8+1ea8k2yCyOaZ72bIObzb1ijixOIQKhM0ItLUUE5mkF9yN6rnp+BMwrSdTtEv0PQDEw0kQCUZ/FmMmDhO8VSZhVPrg9s0NcSSEOQRZbMe0Z8fHnZti657GG/hPCTlU2OKo4TntSRUuyy+VlRMsuAFrSnHZhR/rAwGdhkFp+QgLh4akxoS1AkhRAPqr7bSdvifUNUVkPRB63JKtULDJSCJ45iMFYPK89COobOkpyw1ONZjn5WyLEWaGqlzt4cgsgmiiTitb8IYQ1xOafaALrAMzrOnE/1k00CbffYUQm2a/ak1HEk88/hEUUJ26vDkU3pKhtRWIAt1Dcsu+7zGtXnhZAfbpGSqEe2ew4+m2Fzdp+nQhiu6DPnaobNTRII6IYRoQGlq2FluwbKWoxy1a1DfuDwr4JdXZnjVRxWbt8NTF8On3mywVfiIwVrGjci5JVRlJSYzl4guSv7oZPdsFlE0fhKI+GlC6zmTyRzTTnLHTqxj2jAzcxSNLCWJsREEMS0LC1iHF0jWFAFQ3R7uudPokWQpDS8JY7K7SqEkgF0NOMOJOanLxkoNTjU6pCY4JKgTQogGliSGZljGUCZm6Vyff3zexRiNrRM8K3jEL1TH1uST29H/Pht2JZxWU99NvusdVGqZUey5GE0lE2PNdLDmTCFJUpJEAjoxtvp0RPu7D8f0RZgwRU/0qGQMjE5OKrGXlhaXHZYLqn6+fJKJKBYf/2pbFCW4uybDGnsqdORJUCeEGDNaKxwHwKCUIk0VYXioXYbHjyRJyOjBUyjpo7yUObcPvea/YY8KQvqhj+NNvpDU02TtKoqIVOUoB4VxtWL1RLhu/UBiGI6f558kRoI50TCSxNBDhNUJYJEkkQR0Y6RQ8HhQObzmYcX
fq3B8Br45xWF2i6JclhflsUhQJ4QYM56n+NvfbqSnpwelFAsXLmTBgsMJgsZfmRIHRisg3LJXa4oiJVf6HnrDO8EE6PwxtC38KaHdSnE/tmZaliLjVLFURJC0EYTN8d5ybENrZidqx/cBMF0vplRrJ4olTaQQB0M9sW9zXB/Gq63K4UUPKf61K367pQbnPqi4foaNbMx/bJJ3SggxJhxHc++999LT0wPU69CsWbOGMKztSj8sRpLjWHie3TB144I4i5n40qGN2SNQxOj1bwWza7tNZQVq4wdwwn+TcZ/YKq7rGNrc9eQ2vATv3jNoKX2ZtnxzFKFtzfRirTgavfG96I3vxVpxNK2ZnWPdLSGEOGgSGAjodrs/ApnnfXwkqBNCjAmlDD09O4a19/buPGQKhY4GrRWdLSUK1Wto3f4uOtx7ac2NfTawauCQTPsQ6fRLIXc0ZuLLSBb9EeM/MPzOlX+jo4fIOOUn9G+0eEWsVSdC/3Xg343ecDF27/dw3cZ+f7mujdr2DUhKg41JEbXtG7iubLARQoxPGpix1yWuQ0ODX7Ibhnw7CCHGhDGaGTNmMKkrz/Sp7VSrNVau2Up390TiWM7VjZRCtoR19xng3wWAevhzeAt/il04mb6SO6ZlBHaW83jt78DtfD2JyeKXXTryC6h/te/xHiicDJXV0Hrm4/7dWqv6c06GBoJ6+zfx2l9ASGFknsQIybgxWaeCIiYhCzo3/E5m/JyrE0KIvXWT8u2pmvMeVBRTyCm4Zoohb1LZGPs4yEqdEGJMRFHCgtntHJX/Kp3rljG97xU848Q2HFtJvbIRopRCRw8OBHQD7ZuvxCr/g7w39qV1g1BRqhWoBg7GGPywBbPwJ+B0Awo6zoHuV2K8mfhRy+P+vcYYcCYPb/dmYXBH8BkcuLxXI1/9EfaKhVh3zMFd+1zUxBeDM2nwTjoL3a8miiTBiBBifCr3VznKSVk1x7B6juHuuYYnewZTbI5t82NNVuqEaDC2rbHtelRjDETR+Cwkm/US1OYrUNuvqTdUV2HdfTqZJfdRbbBVlHHHGFS4CTe7BBrs+LkfuqTu6bQsvg2V+uDfh0kNYduLCPzHPw9pDMS6Gz3hfFTPj+uNVhvMuJSMExDEeaIoQSmF4+hdGRnHZoXYs4vodf892FC6CR68gnTxv1APXQWAmfwmSuEEmfAQQoxraV+VDCCFbZ44CeqEaCCOo+jr28Gtt95CGIZ0d3dz0klPoVYz424wl7HLqJ2/GtqY+uhwHUotra+0iANijCF1pmFljwD/7sEbpl4EW78JbeeNXeceRRBCEHZg2xOwnJnEcULiP/H3Q7Gap3P2p9FT344KN4A7A7XhPRBspHDk9dR0Bk9vRfX+CnKLSfLH0V9tGdXPWn2b6D3D2lXxb4SxIphwOQBhNRl31wAhhBAjR4I6IRqIZcE//vH3gYBm27ZtrFjxH446asm4q9+mFJBdBMHGoTe4kzFNkna+GRT9Ah2LrkP1fB9Vuw86nwOlf2KyRxLEjbVKt7c4Tg/4fKUiQt37wvp5tHDzHu0JmeBG9L3/NdjWdiatc66hWM0f0L/5RKSpgfwRgGLPdOqm7TSiJE8wjurTCSGEOHjkTJ0QDUIpRaVSHrZCtW3bNoYkjRgn4tSBmR8Ed+quFg3T3oORuaYRlaaGnlIrUeerSSdfRFq+k6TlqURTPkAl8Ma6ewefMaC8IQHdrhvQG98zpEX1X4tNidFWi1tJ538LrPZ6z9rOIJ36XmqRNep9EUII0Zxk9CREgzDGkM+3oJQaEth1dXWx9yz+eOCHLTgqRs//JugMaA9TWUWUNvbqUbPqL9toPQO78BaSJCWppti2TZhoPDshisbXipDWipxbwagCZs6nUHefx+7JETPxlRh0/czeXpSJRrmnUA08Eu9ccotPQWGIEpdyJS9bkIUQQjxuslInRANJEjjxxBNxHAeACRMmsGTJUqLRH2cedFGcUlULSdxZpKU7SKIiceG
ZlPzR2/p2qElTQxjGGGMIyXP1zzK85mMev7klQ6zGTzCtlKI910dm48uxbu9EJVXMsfdg5n8Ds/hm0hkfphx0kE65eOgDc0cRq/Yx6XMQaXZW2uittFOq5SSgE0II8YTISp0Q+8l1FVrvSp2OIgx5XAMxrRVaq32eFYoiQ2fnJJ75zHOe8O9tRn7oUYumYre8jjRNSSrj83k2mlqS4Vnv0dx5f/3n39+iueQVijc/x8EkzT+DkPNq6PUXofr+WG+49wUoZzrxMSvoL3uktfpnz+u4ACczF73jW6S5pZiJr6JULTDeVsWbidYKnHpNKhXpcZn5VwghDgYJ6oTYD5mMZsWKO1i/fj0AEydO5ClPWY7/KBn6lALP01QqZcrlMt3d3SSJIo6HPiaK0j1W5sb/gKZetmF8bf1rdNVgMKDb7XM/U7zmHIeMbv6gztEBqv+6oY3Rg6jwQdJ0zkBTsZrDtk/DmXoyibEJyynN9pnzbItMrFBKUbNSanHzfpaUY+h1e/j8lm/hpwFvnPxSpprJqECGKkII8VjkSinEE2RZmmKxfyCgA9i+fTsbNmxg2rTZjxiguK7iX/+6lc2b6wkblFKcfvoZuG7+gDP8CfFE2PvIv9Gag/GSMz8xNk5+CRRvHGxUNtgdw+5bz7CpacZkRC3KQt/UR/DdDZg4xX32NDJnTqYvbc7AvOqVOWvVK6mZAIDf7vwLv190DVP1VFmxE0KIxyBn6oR4gixL0dPTM6x9x47tKPXIA48kiQcCOqhvqfz3v29Ha4PnKVzXkMkoXFcdlH4LsZtnJ7zg6UPfqx97Y0qLF45Rj0ZWJciTzP0qeDPrDTpLOu/r+GFubDs2grRWOD0xwRfXYooRVBPC72+ENSUyXvPN1zqOxW93/mUgoAMwGL629QcoZww7JoQQTaL5rvxCjLE4Tpk6dSp33rliSPvMmbNI06EBmeNotK4XDg/DeNjvCoIAy4Ibb7yR3t5eAObPn8+iRUcTBM23ciCag6NqfOKNWd5wrmbFOsUZxxk68hFxE2/d21OaGorBVFoW/RNtqqAyVKM8tXD8RAe2rYlv3T6sPfrHDrwjW6mNQZ8OVJvVCsDh2Xk4ymZ19T7arQJqHGb/FUKIkSZBnRBPUJoaXNdj2bJlrFixgiRJWLhwIV1dE6nVBgfF2azm7rvvYv369eRyOZ72tKeRy+WoVqsD91mwYCHbt28fCOgA1q5dy5w5c7HtHJal0VoRRYlsPxIjylU+S+ZqFszI4oeKIHExlsJRwWM/uAnEiaGvUgAKY92VgyJJDNZhrcPardn5XUFQc4mihNPbl/MP8328+yJUaEiOzGK1ZUgqMsElhBCPRYI6IfZDGBq6u6dx1ln1wtlpqqjVBgcejqNZu/Y+7rnnHqC+Ive3v/2NZzzjGaxatZpSqcicOXOZMmUKa9bcPez379y5k5kzW9m4cSOlUpG5c+fiOB5RJIGdGBlaW5TCLK+6UvGXO+ptL3mGw0dfr3EYXr9NNJYkSdHzW7CfPIH45vp2cGtRAfvJE6imzbni2lHOkf3AWszW+jqjarXJXr2UPj18l4MQQoihJKgTYj89WpZKpVI2btw4pK23t5discTixYvp6dnB2rVrWb16FccddxxRFLF27dqB+06ePIkbbvjrwNm9e+65h1NOOZV8vl2SqogRkeoM3/7dYEAH8N0/Kl5yusUJh2mSRN5nja5fx7S9fh7eS2ZBbDAK4haLWhOWpXBdi/i2noGADsCUYsJfP4T3/KnUIgnshBDi0UiiFCEOCkVbW9uw1tbWVtatu58bbriBzZs3UyqVuOGGG1i4cCG5XI758+dz+umnAwxLxrJq1Uq0lpU6ceCUgmIVVqwdftvNd9WTAYnGlySGXiIqbZpau0W5U1NswoAO6mcEo77hJwFNf4htyVBFCCEei1wphRhBSilct/6xOu6448lmswO3HX744SgFmzZtGvIYYwz9/f2cccYZOI7Dtm3bsCyLrq6uYfc
TYiQYAxkXTjtu+G1nPxlZDW4yQZxQNQlRE79ucWxQJ3WAvceEggJz7kQi07zPSwghRotsvxRihFiWwrYNK1feSblcYu7ceZx11lmEYUiapmzZsoU1a9bQ1tY2bBWuUChw7bXXUqvVZ6rXrFnDaaedxh/+8IeBYO7oo4/elV1TgrtGZdv1gL4ZgiJNzJL5Dhedr/jKr8Bz4LJXGWZOjCUpjxh1YRhjtSmST84j++Od6NBQfW477iSXIJCtl0II8VgkqBNihLiu4g9/+MNAYLZ9+3aOO+44QHH77f/adR+XU089Fd/3WbBgAZ7n4TgOxhiCYDDrYBiG9PT0sHz5crZu3crcufNwHI8wbPxg4VBkW4pCth9Vqhe7Nq1Po+i3ESeNFxwpBS2ZKo7uoTtvOPqVLbz1+V1oDXk3JI3HR6060YQCh3RqyubXZ7DRFLI54qpFMxaGF0KI0SZBnRAjQClFpVIZCOh2u++++1i8ePHAz2EYcvvtt7N8+XKuv/56isUiAAsXLuT444/ntttuG/L47u7JdHZ2sn37Djo6OnBdmzBsvEDhUFfI9mGtPB6irfUGZxKFo/9Fb3n4ucqx1parYG94E6r3ZwBkCk9l+vwfsbOcJ5UFETGGksSgSy4Trcn1n0spiQR0QgjxuEhQJ8QIMMbguu6wds/zcF0XpRTd3d04jkOhUGDlypUDAR3Us1uedtppOI5DFEV4nsfUqdO45567WbVq1cD9li1bRnf3NKJIBjqNwnUt1I7vDAZ0ANFW1I7v4BbeShg2Tnp5y9JY1dsGAjoAVfwbuvfHOPnXEEWN01dx6JLMq0II8cRJohQhRohl2UybNm3gZ601xx57LB0dHTzzmc9kypQpFAoF5s+fz86dO4c9PgxDjjrqKBYvXszZZ5+NUrB69eoh97njjjuwLFmpaxSOY5F1Q1S0Y9htKtqOUo2VRdKyNJRvGdauy//E0hLQCTHStIyyhBCjRFbqhBghYWg4/vhlLF0ao7VCa02a1ut9/fWvf6VSqQBQrVaZPn06vb29A49VSlEoFNi4cSNJkrB69WrOOeecYRkv41j2xzWKlqyP5/8dvfnLMPNDsOVqYHdgZGEmvY7Qb6xAKY4TTMd5sPmKIe3phAuIE42cXRJiZLiuxrbr1/tMJgNoajX5fAkhDh4J6oQYIcbUk1DcffddrFu3Dtd1OfbYY2lrax8I6AA2btzIM57xDHzfZ926dWSzWU444QTuueceNmzYMHC/3t5euru72bZt20DbrFmzJANmA7AsjReuQq9/G4SbwZ0KR/4B8/BXADDT3kcl7m64MhRpagj1bLw5X0RvugRMRDr1YqLsk4irMuAUjcmzLbxUgaXxSRp+m7Dn2YRhhd///rqBibjFixczZ848CeyEEAeNBHVCjBDH0WzYsJ77778fgCAIuOmmmzjnnHMGzsoBpGnK6tWrmTt3LjNmzCAMQ2zbHhL4AaxatYpTT3069913H9u3b2PKlKnMnj2HWq2xBzSHgpaoRLqjhWjHx7AXzkFXPo5a+zrM/G8QOEup1uyGLQtQrmWJci8me/S5ANTiFmpV2SMmGlPBcrB3xiTryqQbKrSc2EXS7VI0jbtrQamEW2+9dcjOipUrVzJ37twx7JUQYryToE6IEWPYvHnzsNaenh66urrYsmXLQNu8efO444476O/vByCfz3PCCScMuc+sWbN23fcw5syZhzEKv8G28x2KcpFP7eOfwr/68/UGz6Pzz9/FyT8AcQk/dUjTxp6NDyJFEBXGuhsjxrY1OacfrQxBksUPnLHukhgBrm1hbQsJfrSR+Nb6dvXwF5vJ/Pc8vJM6COLGvB4qBaVSaUibMWbXxJ4Mu4QQB4dcXYQYMYqurq4h2yUBOjsnMGXKFCqVCqVSifb2dh544IGBgA6gVqvR0dHBggUL6O/vZ+bMmeTzef785+tYsmQphUKHZLxsEE7Np7w7oAMIAopv+TDtP3w3JncCSUVep9HkOgmtajX
6vjdAsAG76wIy0y5jZzk/1l0TB8iJQFlqIKDbLfjeRjLLOgke4XFjTzNjxgzWrVs30LI7E3Icy/VBCHFwSFAnxAiJopQFCxaybds2duzYgVKKY489DsvS3HTTTRSLRaZNm8aUKVMGAr/Zs2czc+ZMPM/DGM2iRUdTrZZYuXLlwKrdjTfewLOedS5KaRrsiNYhyZRLw9qSTZsxuSfT77ci5x1HV4tXRv/nVEjrNSLV1i+hrRYyEz5ALZBtpc0stQE9PIOsafCSB0GQsGTJErTWbN68mUKhwAknnEAcN1Y2XCHE+CJBnRAjKAgMJ520HDAopVAKrr32WnzfB+rFyJMk4eSTT6a/v5/e3l5uu+02lFIcffRipkyZMiSg2+3hh7cwefLMhk8QcChQnZ3oyZNIHx6sS5d52YvwvULDnqMbr2zbQgdrBwK63XTvz3AnXkSNxiv+Lh6/WpqSbbGxjmojWTW4s8F9znQCywwmm20waQq+n3DUUYs58sgjAU0cI9dvIcRBJdOYQoygNDUEgSEIoFYzRFE8ENDttmnTJqA+Y/uf//wH3/epVqvccsvN+H6V7u5Jw35vW1u7FORtEOVcgbYb/4j3nHOxDl9I7tL3kHnn/1Az1lh37ZDiOIY2+16UlR12m8kuIjHeGPRKjCRjDCU3IXPx4XhvmId9cjfZDyyC07upJY0dIO0O7Go1qNVS2XYphDjoZKVOiIPItm2UUkNS27e0tKCUZv369cPuv3HjRhYtOpL16wfP3M2cOZOWllZJhd0gksRQ6pyE98XP4oUhUb6F/lS2VY22Vq+EXnEyTH8fTH0bPPRpwIA7hXTWVfi1LLIVtvlFUUovIe5TO9BP7SRIU6nXKYQQ+yBBnRAHUZLU6xOtWLECqAd5y5YtI0nqCVT2PEgP0NnZSRimnHzyqaRpgtYKKVo7thzHQmtFGCYDwXmaGnzLg6wn9brHgNYKnWyHqW8HdxpkF0HXiyH1Sb05FKsdshV2nAnDxl6ZE0KIsSZBnRAHURQZZs6cw6xZs/F9n3w+TxQZarWI6dOns379A+zYsQOAyZMnM3Fi9x5lC3bvjpbB6ViwbQs/V6YvqeIqB8dzyERZEv+xHysOrpZsACoH5dtg2zeg/QzofhVs/iTBjK8QJ/KZeSLUroVmScQkhBDNS4I6IQ6yMKyPlCwri+8PLuvUavWkKsakuwZVsiLXKJQCP1fiow9+iVdO+i9+uuMPbAoe4gVdz2RZfimqInXQxpKrK6hVZ0Kwvt6w9esQFzFzPoXv55GJkMdHa0WLsdDFGAykBZuyTmSVUwghmpAEdUKMkr1nwY2pJ1TZnTRFBqKNQ3mGqx/6Js/vOosL117CprCejfTP/f/k8pnv4DnZs4lDCcDHTOoPBnS79f4c5nySRFbpHreCsQkuv4v0/jIAenaewmVH0kc0xj0TQgjxREn2SyHEuGBZmkxGk81qslmF5+3/nFWga6yq3ENi0oGAbrevbf0hFatyoN0VB8CoHKi9Xl9vNkkiCWseL8exSG7fORDQAaTrKyQ39eA44y+Tq9aKTM6GXEyUSqKV0ZbLuQd0TRZCPDb5hAkhmp7WCs+Dm2++mS1btpDP5znxxBPJZlsJ92NFzU0yHNd6NBntDrutxcqhkeBhLPlRltzMj6A2vLPeoDzMvK9RDjqQzDWPj9YKs3H45ES6oYI+uXMMenTwmEyE74Z8a8dv+Xvxdpa1HMMrJj4fp5KRraYHmcpnKNk2P6nARAue3A4tFV9q9glxEMhKnRCjSCmF42gcBxxHPn4jxfM0t99++0DR9kqlwl/+8hfsR5i20lqRi3xa/SLZNESpvYK0UPHGyS+llgYsLxw/+Dg075/xZrJR7mA9FfE4VAMPv/BykqX3kx55I8nStZRYTCS1wB63KEqwT+ke1u48fdK4GnDbtmaH7uXKTV/gE5u/xs2lO/jMlmt40wOXEnq1x/4FYr85jsXD2ubIBxQXblWc/5DirE2Kcn54bUk
hxIGTlTohRollaWw7ZeXKOykW+5k5cxazZs0akjxF7K+UrVu3DmlJkoRqtQpkhrRblqKlZyuVC99GdMeduGecStvHr6Q/UxgoWWAMZKstzM/O5mOz38P9tQ2sqa7j1PYn02k6iavymo21SpClEmRRqhsTyWrLE5WmhrDdxnvPEUTf2wAGnBfOIJrokKbjJ6ir2T6BCfhN7/VD2m8u3UFkhdhIkfqDJch6fGgH+Ht8PFeHcEcNTvYsgmD8vM+EaAQS1AkxShwHrrvuOiqV+pannp4egqDGvHkL92uLoBhkjKKjo4OHH354oE0pRTabxfeHDvjz1RLFM59N8sB6AIIf/ATTXyT79S9TtQcDwDhOyPutBJka87xZLM4egakpCMffeaNmZiQP/36rkuAsypG5bBEAvmuaapXOsjTGmEfdQqlQRCYmb2UpJoPnB21loZHP8sGUKEXfPs657kxA2xponveaEM1A9n8JMUqiKBwI6Ha7//77UUoGpQcqjmHZsmW0tLQAYFkWJ5xwAmk6fECh/OpAQLdb+Ic/4STDM/6lMTjlDLlyAVO0JaAT404Up5RUQkklRFFzTC45lqYDm/yaKq1bEwrKZu8d1Lu5UYaCbuV/pr16SPuFk1+GGw0/MytGTiEMuahj6PdbQcMpefB9ybAqxEiTlTohRom9jwNe2WxWCv6OgDhO0drh9NNPJ01TtNYYo/Zd98/zIJOB2uB5Gmv2LCRfghCNT2tFvgzVd9wOuz714+X0AAAgAElEQVTf+qg2Wt+xgCLDs1omSUpH0sGZbU/jSYuWsKJyN8e2HEkXXajq2A6BLEtjWYokMSRJcwTUT4TvRywpuPxhOnyuT9GlDe/rgtYg2scrJYQ4UBLUCTFKjFHMnz+ftWvXAvXtgccdd/yu1SSJKA5UGO4eJuz+e+77b1rL5Gn53FWU3/AWSBLIZmm55sv4LW0giTZEg3MsTT6xoBihWmxqlsE3h842tozRhN9+YCCgA0hX9aO2h+hua59bMXVgkw8LtDudzM3NoTWfYfv20mh2ewilIFZZXNWDFTxAzq6hWhZS9AvjLrhzixWeknU4ptvBSUGVfWKpJSnEQSFBnRCjJAwNixYdxWGHLaBUKjFhwgTiuL7KJEZPoGwy55xDx/2nYrbvQE+aRDWTk9dBNDzb1uR3JPgf+A9UE9Dgvnou2ad0jqvAznUtcoFCK0WiDGWdDgQ7KgWzMxz2GNMfoSbZPNJkjjGDEz+trZl93me0hGmGJOph8vYzobKi3uhOo/3om+kptY1p3w6GJDHkaiFRlJDIlgghDho5UyfEKAoCgzEOhUIXtZohjuULbizULJf+XDvFWfPpy7QSSsIE0QSykSL47H31gA4ghfAb68js4+xos8o4Fi09KfFftoKfYIWG9tgik6nPQYeOwX7mlKEPylpY81qaZpXLKItM7cbBgA4g3AwPfwHPrb+WlquIsjWUlw4vudIkLEsTtfr8X/lHXNnzWbZ4D5F6svFSiINFVuqEGGXG0DSDDyFE49Ao0of8oY0pmFoK46T0Vy7ShDdtxTm2k+oHV2G2B+gZOXLvPYK0YBGGCZnFbXhvX0j8+y3Q4eC+dDZlnTRN3XljwEvWDWvXwf0olZC2xPyg97f8duf1HJ6dxzumvpZ8E27NDLJVnr3mdTwUbgPgO9t/wU8WfoEF1mFN91yEaAayUieEEEI0gcgy2E+aMKRNtTuY7Dj6Kq8luKdOwr/yLsz2AIB0UxX/Y2vIB/UVq5KJ8Ze0oC5egHn9HIo5Q5Q2T5CgVUrY9nzYa4dAOulCYm3xxW3f4aObv8jK6j38uOd3nH/Pm6hlKvv+ZQ3KshR3+fcNBHQABsPVW75B5ARj2DMhxi9ZqRNCCCGagG8S2l41BxxFcmsvekYO98L5lK10/JT88iwIEkzf0JT36foKisFtiFGcEimaZnVuT56uYbmTqS74G7mH3wupTzr1PQTWEVR1lR/v+N2Q+28Kt1A2VTqaaDn
WGNB6cLKh3SrQYRewlQw7hThY5NMlhBBCHASOY1Gt1cuZxPGBnyUyBvpVROZlM/EumEWiDRXHoJUaN0FdaBk8V6MKNqY4+DfTM3OYcZIlOE0NaQjaW4o/5ydoFeNHBaJagsr5dDkd9CXFIY/Jam+Mert/0tRweG4eh2Vmc+GUlzLJ7WJLuJ0TW5fihM54ebsK0VDG0Z4NIYQQojHEZPn1LRle8iH46u89InIjkvDCGPDThH4d02s81m3L8ZtbsxTDFmLGNqvjgVBKkVMWOjYkniL7nkWodqd+26QM2XceTqW54prHFAQRZd+jWM0TRfUwJxvmuWLmxdhqcGvmyyc+DydpvkLpnp/jews/zV/7b+HF97yNdzxwBU9f+RI2qc1Ylgw/hRhpslInhBBCjCTtcNUPLa76YT2I+9U/NL+7yfDdSzxcVXuMBz8+oclxxbc1X/5V/d/QGn5xpc1Tj3QIgugxHt1YtFa0pRbBNx4gXNWPPqwV703zyX1yab12pK2puMketSjHrzhOWeAext+P+jErq/cw25tOp2rHqjZfUJckKZXY55e9fxpoq5mAyzZ9ii/OvBLbb77nJEQjk6kSIYQQYgQFicMXfzF0Ve5vdyqCaORKZ0TJYEAHkKbw9s8qdjbh4D+XaoKr7iX5+w5MX0RyWy+1S1cRK0Ovk9CrIoKoCQ/P7ScVajLlFp6sTqC7Ngmr2pxLlEpBMRle5H1b1EOqZAOmECNNgjohhBBiJBnI7jUOVwqsESyHWBtef5utO2nKmma2USSr+oe0pRur6Gh8nKHbX0mSYpr4T2AMTHUm0Wm3D2l/UdezyMbNk/RFiGYhQZ0QQggxgrJOyOWvHToaf8VZBtcaue2D+SzMnza07WVnGrJO821RNIDq2GuFMWuB3XwBqhjK87P88vCvcE7HqRyZW8Cl09/KCzvOJdnHpIQQ4sDImTohhBBiBKVJzHknRiz7qs11t2uOW5By+MwUm5Grz9Xq+lx7VZYPXwOrH1Cct9zwyrOBZGTO7I0m3zZkLzqM2uV3QWxAg3fhfHwrbcqSBaPNtjWWpYmimEYr15ckhja/g8snvZNIReTSPHGlwTopxDghQZ0QQggxwmwVMKMz5J0XtNDTUyVNR3YfXRQltDplPvr6DH6oacvGhGFzLn+ESYKalSH3leNJe0N0h4uvDUEq564ejdYQtvisqW3gvup6Tm5bRkvagm6wc5VpalCBjYtNvFeUblmaJtwxLERDkqBOCCGEGCGeZxGlHlozsGo20gHdbvVVmRpZC5o0nhsQmJSAFDVBY0zMOClJd1AFOZ/3bfg4f+r7OwAazTULPs6yzLHUao2dAdWyNH6c5eY76+/j5ceAZ2mSRFbxhNhfEtQJIYQQIyAmxwMPaq76EWRdeNdLctjNmbhwzDRzYpDR5lMbCOgAUlKu2PR5vjX/k3jkx7Bnj60aZznpQs3m7fWfJ3XCLV/KktGVse2YEE1MEqUIIYQQB8h1LTb3aE66UPHjvyi+da3ihNcpduxK6qi1koLLYkTV0uFnNEtJGaMaOzJ2HIuf3KAGAjqArb3wf38A15W1BiH2l3zDCCGEEAfIaI9P/3joSpMfwE//CtrNc/eDOa6/M4uf5lF6BGsbiENWh11gtjd9SNtLJz6H1qRljHr0+Cil2L5zePvDvUrO1wlxAGRKRAghhDhAShna9jGW7izAZd9QA8XI81m4+YsZJhUeOXmK1opakqG/onFsyHsJrq496tZEy1JUowxxqrG0IecEpIkkGhnPcn4rPzr8s3zp4e+x1t/AsyeczimFE4mKY92zRxdFMS8/0/CJHyjiXW9RreG/n22wLEVra4ZaLSKK5P0rxBMhK3VCCCHEE2SUQy3NUwrzBGkWE9d42wugsMdRphnd8PTj4Eu/HFx+qPjwga8rYuM84u/2kxxnXmxz9Cs1h79U88ZP2QTpIxdr1lrRV8tx3vts5r1Is+wNFnesy2KUzNuOZ1GU4BbzvL3rdVw
981KelT0Dq9j4hziNgY58wC1fSnnu0wznLTfc9hWYMLWXP/o38r/bvsB91lqiVn+suypEU5ErvhBCCPEExMbll/90ecfnFLUQFs/T/OLKHBNaIlZ80+HX/4S8B6efAGsfVMNW2Lb2KuJE4+5jWtW2bT77U8XdGwbbfvNPxZufpzluniJJhi/XBYnL6z+mWbG2/vO2nfDc9yvu+rZHRjVfMXLxxMRV0Lj4NHbGyz0pEzNjQsIX3uZgDKStRd65/kr+0n8zAF95+AdcMesdPLdwNmFZMmIK8Xjsd1B3++2385GPfIQoimhvb+fKK69k2rRpFItFLr74YjZt2kRnZydXX301EydOHMk+CyGEEGNCa4UfOLz5U4PB2p33w/u/qvjkGw1ZXeaVz3AxxuD7EQtmtNKag1J18Hdc+JyUnBsR7yPeilLNivuHHyxa9YDihMM0ya4tlUo7+LGDMfVzSJM6YdW3AAOeCzf8p/5vZho7CaI4hKWpQRGiAN8EAwHdbp9+6BpOb1/e8Jk8hWgU+7398p3vfCeXX345v/zlLzn33HO5/PLLAbj66qs5/vjj+f3vf8/555/PFVdcMWKdFUIIIcaCQw032o7/wD9oSzfx9ueVh9x+y12KIKonQKlWQ3y/vmrS3QG3fDnl/FMMyxfDdy5JOW1pQhzv+7yQo2NecvrQlQml4KxlZuAxsfH44Q0eR73CYu6LNOu2KC55JZz7bjjqFXD0KyCKoS03wn8EIQ6SdB+FCSMTIwULhXj89iuoC8OQiy66iMMPPxyAhQsXsmXLFgD++te/cu655wLwrGc9ixtvvJEoap4tAUIIIcSeHB2z5Zaf8vPXzOO6S87i16+fw4sP+x3nnzz43XbacYasO3zpzbagK1/lM28J+d4lIWcfX8Oi9oj/VhynPPWYlI+8IWVqFyyYAT/9cEpHPsKYeoDXW7F566cVZb9+PsmxFG+92vBA/WuYWghvvhriVFIJiuaQUS7Htywe0vaGyRfQmjZ2Jk8hGoky5sBKfaZpyhvf+EaOPvr/s3ffcXJW1ePHP/c+ZWZ2tm92Nz0hhZCEYuhIMyBC6CiooFQBRVD4oggoRURpgj8REBAQEBAVKUGlSAell0BIgGAC6WU3m23TnnZ/fwzZZNlQs7uzO3PerxevF3NnZnN3nynPec65527BKaecwuabb86sWbOw7Xxl52677cZdd91FY2Njr0xYCCGE6E/p1Uu5+zsTCXPrGjfEKuuY8JPZ7PSjYeyzPdx4Jgyt671/0/OhpQMU+Wzf+q3eb30Ijrt43e03boHpp8Lqtu4/47nrYPvJvTcnIfrSCq+Jf6x+jNnpd9i/dg+2LNuMoTFZviPEp/WJa+oefPBBLr744m5j48aN45ZbbsHzPM466yyCIOC73/3uBp9vjEHrT58QXL268yPbPPen+voKmpo6Cj0NsR45JgOPHJOBRY5H33ADr1tAB5BrX82mI0Pe+2uErQNs49HU1PO7a2OOydrd7Jqb140pBdMmJlm/0GbhCpg+zfD3J9dFfokYDKs1NDV1LxMV8j4ZiOrrK7Da4ny7+qtEtYbIi8i2BzQhx6lQ5H1SOFor6uo+e5b6E4O6GTNmMGPGjB7jqVSKk046ierqaq699locJ9+euaGhgebmZoYOHUoQBKRSKaqrqz/zxIQQQogBwYpTt+m2rJ73ctfQ8G32Bu0QJwXwsXvI9SalFLUVhpdvNDz6suJ3f4e7nzJcfjJ0pA3/fkkxphFuONMQdzyQrb7EINLZmSv0FIQYtD5398szzjiDMWPGcMEFF3TLxO2+++7cd999fO973+OBBx5g22237Qr4hBBCiMEmsCr40s/u5fXbz6PprWcZutWebPHNn5El2X/RHGBZmpZ0gu9ernnpbdhtS8MzV0PcCXC1zw0/iRFFmshAdSKH58l6diGEKBWfa03d3LlzOeSQQ5gwYULX2rmGhgZuuOEGWltbOeuss1i8eDEVFRVcfvnljBw58lP
/bCm/FB9lsB0TrRWWpQmCiI1cujpgDbZjUuzkePQdpcBRPsZPg5skiOxPFc/15jHJRQn2+pHNvMXrxnaaCn/9uY+rPrr5iuhO3icDjxyTgUeOSeH0WfnlhkyZMoV33nlng/dVV1dz3XXXfZ4fK0TRiMUUuVyGVauaqa9vwHVjeF5xBnZClAJjwDMOWFUFK2nMBbpbQAfw3BwII2vdAjwhhBAl6XOXXwohNsxxNPPnz2POnDldY9OmTWPkyLF4XvQxzxSi+GmtcJUGBV4UDYjKjMHCtQ2VSWhPrRsbUQ9ayeeKEEKUus+9+bgQYsMsy/DWW291G5s9ezZay8mrKG0xZVHZDtbNC7FuXURlShH7DN2RS13c9rj57IiYm79dFodbf2ooc7zCTkwIIUTBSaZOiD4QRd2vnIehtKATpU0pRSJtyJz2Knzw9gieaqLsmm3w3H7tNzJ4RQFfnAzv3B6jLQXV5RCzcvL5IoQQQjJ1QvS2KIIRI0Z0GxszZgxRpD7iGUIUP9e1CB5Y3hXQARAY/MdX4ThyffHTUiYgrlM0VqSIqRREwcc+3rYtsmGcXBjHtmXhnRBCFCv5JhWil/m+YbvtdqC+fgErV65k+PDhjBo1hmxWrqaL0mUMkOwZVKikBUiari8ExHj6dZtf3Jq/fnvuURY7bx5gI3uBCSFEsZGgToheZgxkMiEjR27CqFFjMUaTyUhAJ0qb5wUkvzKU4MHlmI58dklVO9i71NMh+6n1Oq0VS1bZfP38dQU53/i55rnrbMY3eNKgRgghiowEdUL0Ed9fW2cmAZ0QAB1ORMWVWxO82gKWxtq6gkywmjiGSMfwicvaul7iOBa3Pdyz5PvWBxUXH2+Ry3182Wax0ZZN2ncBRVtnft9Bea0JIYqJBHVCCCH6RRBGrCHC3r4ay2Ronv1PXrj2FLyOFkZ/8RC2/95VZFUFUWTym32TQYU5lFKEVhl+5BT6Vxg0osgwdZOe45uPo/SydNplwUqXdxYrnnwNMh5cflIZNulCz0wIIXqNNEoRQgjRr4IgxGTbePrSw8m1NWOiiIX/uZu591yOnVmOjUecFK/+4RTuOXYM9x0/gfcevIqYnIR/ar4fcsAXI7Ycv25si3Fw0M4G3y+d6gHL0qzudLngZsWFt0BlEo7bF66/X6Mtua4thCge8okmRC9SChJYuEYRGcg4EX4gGwMLsT7L0qyZ90aP+rflsx4jWT+ahqm70vTeGyx85i4AQj/H63f8nBHbzsCqn1J6mabPKaYz/PPSBE2tCmOgocYQ0xmiEvpISgcJvvRDRVNr/vaVd4Hnw/A6RcaziVmlVYYqipNlaSxLY0xpXbQR3UmmToheohRU4cCN75M5/iW8c2aTXOITU/I2E4ObUvl95npLFEVUjdqsx3jdhG1oXzqPOXdfTuT1zMqtmvMfLEveT59WFBlc0oysSTGqNoVLuuQC4uY2ugK6te5+CnafZog5cvIrBj/HuLS9o3n20jQLHgiJmThayxZKpUi+HYXoJXFl4f9lEeF/myECszxL5vzZlBnZG0oMTlorqpOd1LrvUuPMoTrZgWVt/MmCMaATtWx97MVoxwWgbuI2bDrjBOY/fjtOspLyYeN7PK9h6i6EYQmlmXqJMaXXFERrRVCWo7K85y8+phFGNYAy0nVVDG6OZbHgkRx3n9DMnPvTPH5RK/ef1oIVuIWemigAKb8UopfYAfivrOk+6BuipiyqwSq5kyox+FWVdWDPOwA6XwHAio+jasrTtHRWbvTPNnYZjVN24ZDr55JpXUX70nd56uJvYqKQqV89Aytezphdv87C/9yFZbts/vWzideNIVdimSbx+XjJDN9d8FP2iu3HSYfuw7V/jwNQFoer/w8qYxl8ienEYJezefGm7qnoFW94hBkgWZgpicKRoE6IXqLdEGtsgqB5vY19FVi1LsZImY8Y2CxLUR7rwFJZDA7ZsBLd/lRXQAdAdgGq6Sbcqh/heRuXMdN+G09d8g3
cZDVfPPUGvM41bDrjRDaZfgSBU4sfKbY+4Sq2+c7loBSRLiNnpPul+GTahRtW/oVXOt/k9dRbnL2fzTP770i2tZqJwzWNtZq2VvlMFsXAsKEVHr1YLS8GESm/FKKXWPEOYsdWoBrzV4SxFbFjGiCeLezERK9TSuGbOKmgnHRYTiZM4pnEoF3HoLWiKt6MO29vrNfGYb+xOWXZR8DqGUTpzNto1TslkEEuTeuiuTzw411476k7Wfbqv4mCgCDMN/bwSJDT1eRUFb4EdOJT8nSOV1KzAQhMyIXNv+LI1YeSHv005W4GV15KoljEQ754cvfKidE7xNBxqWgoRRLUCdFbTIjVchjJ8z2SV42g/OpGnM1vx7ipQs9M9DKPBMddZjP+m4pJ31Lc8A/NoiYbLyor9NQ+l4SbRr93MqTzJ8KEbeh3D0eVbwN0D1SjhhPxg43/6oicciYfdFr+hjE0vf0Cfi6FcqVmSHx2MduiIrJIYpEgwfTKnbrdn41yTE5MkDWZoqj4QcjIL9occWc92x5Tzn6X1bDPxdUEllfoqYkCkPJLIXpJLkwQr/gietGuXWPRsNPJBYPzRF9smGXbXH+/5pGX8sGOH8BFt8OMnaAtrWistAmCgdsmvbrcw1b5Cw0hCdZ0xHG0h+p8vvsDjY/xV2MmP4JefDZEOaIRZ5OzpxBmN/7E2A80E/b5HlWjJvP+03dSN3F7xn/5GHKqAmTdnPgMqrRD9PBK/MdWoofESB4/jqOHfZX52YXMbHmUSjvJ+SNPpTwol7XNougE2ic2XDHthDhRFJELciCv85IkQZ0QvSSdixMfcSameg9U6+OYmr2gbBvSHdKFqpj4ocVTs3qWWb42D4ZUKfbaxgYGZlBXV9GGaroFtfRSiHysoScyZMRZZLw4duVuqJb71j1YxTB2I+3BZiTGz0RhyPhV+L0Q0K2VNWXUbLkfdVvsBdol40el16ZRbJSYbRE9ugrvjoUAhCuyZM58nYprt+XcxlM5e8T3wUDcTxDmPuGHCTFIRZHB8wbm947oP1J+KUQvqUh0oN49HLXwZ+AvR71/Jiz7NYmYfNAWk5gL07fuOb71JHh7MeQG6OF2XQuVm49adC6EnWByqOVXQdvjGF1BNPZKqPigZM1pIJp0L+lcGUEQ0ZEppz1TgR/0fulaEER4oY3nD76yOKUgFyVYnUrgmTiWLddJ+5vjQfDkqu6DuYhocRrLc3A6EzipBKFUowkhipx8AwnRS2yVQbU/k7+RmQuAzr1HfOiPyeQ2vgW8GCCikIN3Mcyer/j7U5CMw5nfyq88S8YMjg4ZiM1OYzEHVjzYY1yt+Qdu5UG0pmopH3c3tvaJ0KS9Kjxp+f6RtDLc/bTDD662CUKorYRHf6MZVRcNmHVblm2TCzRxOyQIBuCLshdEFvnmVIu6b1aval2MZH2FECVEMnVC9BqLDzeVQJdJaXuR8f2AmgrD7tPgsd/C3y8E24KcD9/c02DCgRkJeV4Albv0GDcVuxCYGFFkaM+U05KqoTUlAd3H0VqR8mL88JoEa2OllnY47lKL7ADY9FdrhU+SK++JccwlMe58MkGginNtb1ZFuMduAuXrrlHbezQQJrVU8oqSoVR+W5q1LK1xghg6HcMNY9haTvdLgWTqhOglXpgg3nA8atUNXWPR6IvI+JKlKzauSnPATnHa0haWhokjwbZC4jpLNDCSND3kcgEVlV/A1B+NavoTYDA1+0Ld1+hsl8VGn4XWis5UiP+hUtu3FoIxqse1nf6WDeMcep7mxbfytx99WfHWQs25RzqoaHBH67atKfMV2ihCDWkVkUpC8ndbY5qyqAqHIK7ooDgzk0J8mBO55FqhdVnIkEkOKh7Q8b5i5g+byLRGlNVqDr66jsRICAZIFYHoGxLUCdFLOrMJrOG/wB7ybUi9iKn6Mh7Dyck2dUUnigyuylCf/PB4YebzaTW3V1E76iL06PMBg8GlNVVR6GkNOmEYUVnhMqQKmtvWje+
7Q4RjGyjw6yDrrwvo1rrpX4ozj3CIqcEb1NmWprzVkLtsLtGSDHpMGRVnTqYjCa34qAYbYwb4m1CIXuRELs9d1cmc+/Llx5YLx9w3lH+cvopMa/69kG6J+MfpLXzjtiFgywW8Yib5WCF6UVs6yZpoa9rLv8+a3AQ6s4lCT0mIblo6K2juGEJzRz2rO6oIJaHxmRkDFW6Gf/86yzaTIO7CV3cN+d2pBm0Kf9LkWPlyrPVVJBj0bc6TgSZ74RyiJRkAooVpcpe8RdkH+ybKGrrPx7I08XgMz5SRDRPYG2j4Y9sax7F6vK5E4SgFUVZ1BXQAoQeZNRGp5u4XNzpWhBAqrHQMN4hjS06nKMlRFaKXRZEhiuRMWYhi5oUWmwyNmHlhhggb1w5RJhgQgUV5IuLxKxXtKcWDz8MfH4DLTjIkHI/B/NGkQjBN3YPmaFEay1DwkteBTimwbYsoMt0a+QTEWdlm8/JbikvugDCCs79tMWN7H5scWityUYLHXlEsboJDdoXKWA41QLdtKQVO5BJlFNmOCK17ft5kWiOqRlm0LV73Zq8bZ9M8z+e+H65G27Dr/1UxYYZLMIgz96InydSJoqG1wrLkJS2E6B85L8TWAa7OQuQPiIAuIM5/Zlscf6nimIsgGTfM+7PhK9t4ROHgPREPcWnzNKra6TauGuNEEtB9rBCXlR1JbnggzgvvJPApyzfWsG2eet1mySrFdy6Fd5fAgmVwwmWKd5c6WJYiGyXY+8cW37pQc9Z1mqlHaRatjsl3bYE4kcuL16f44/4r+fPhTfgZQ1ld92PRvsznwCuGMGRi/r1SP8lhxsW1/OfqdgCiAJ76dRvkrH6fv+hb8q4Ug55SEI9rgiBNW9sq4nGFbcu3vBCi9HRmbb56jmL+UljTAVf8VXHPU/ngbjBL+w5H/c4hfcpUVFX+ZFXVusTPmkzalnV0H8W2LV5422HLYzRnXa/Z/yzNCb/WeFGcVM6hqlzx6Ms9n3frQ2A5Md5drJj7/rrxIITzbtJ4odPzSaLPBSnFG39Ldd1+6vI2vnZtPZvunaBhM4fdf1RF7SYuD5/fwnbHVfCNm+vZ/cdVrHrHo3le96xc29IAreVcqZhI+aUY9FxX8+KLz7N8+XIAtNZ85St7Y1nxgu8XZVmK8lgblvIwOGTDGoJQ4fuDuAZKCDEgxWI2T71Aj1b+9z6t+OpuDnE9OEuttFa8swgefllzHOX86tStqY5FpEJNfbVGhdKN6qN05lzOur779fuHXlB05izKXMPSJpgwsufzNh2lSOWsDW5tkslBZMCSeKBfaa3obOp+7rDiTY/HLl7DHmdVs/wNjzE7xbj5oJWYEB48uyX/PBu+dWdj13OchEJpqB5t40eDN3svepKgTgwq8biDUiHGaLLZAKXA97NdAR1AFEW89tqrbL/9TgVrAmHbmqSbwlGrUW8dBNl3wWkgOfE2TNCGKd+OzmwNXiDJciFE7wiCiKlje45vPs5QFosYrLsZRJFh01H5/3/oZc1DL8cAOOEAw4XH5gZ9A5i+lt5AzOv7EDmKS/8M/74CdpwKz8/J3/eFifD1PeC+pxWHTTc01sLKlnXPPfOIiITjE0g80K+iyFA9xsZJKPzMuhf9iGkxXr2jk7n3pznijgYwoDRse3QFE/ZIoDQkqjU7nFjOyK3jaFuRqNaoDxoqDYCqcVwv9+cAACAASURBVNFLJKgTg0Y6nWbJkoUsX76c+vp6xo8fTy5nyGa9Ho/N5XIU6pvedUIq1Gx0eiEsuyIf0AH4q1DvfB01+V6YNZXyLWexJhw2INbhCCEGB6XAM3FyvkUYQcKNiFvZrgYYoxoMR34Fbvt3Po0yaTT85AhgkGezkq7PH85wOf1qRWcGdt4CzjnKoIwvMd3HSMZ8Tvu6xY+vWZdWmzIWqssjOjKaQ78EF94Kpx4Kvzohn4Fr64SX385fDHBUluevjfP7+xSLVipOPiRi3LCAIJCS10KIbJ8j/95A58oIbUG
2PcKOKe49uRmA+U9l2OaoCioaLTpXhfz12FVEAQzbyuXAK+q45/vNNH1Qhjlq+xj7XFyNpwvfsVf0DgnqxKCQSGhmzZrFggULAFi2bBkrV65kxx13wnEqicViHwRyeRMnbooxFhRgA9ryWCd61pdhyoPQ+Ur3O8M2QEGURi+/gljDr8nK56kQ4lPyTILTrrK4+6n8Sfp2m2nu+WUZDvl1NjGV4lcnlnH+sZqMl9/KoCKWxh+kWbq1LDwO2iniK9u5hJHC0SFxKzfg94YstDAI+Ob0gE2GWdz2sGarCRHf2Q/iOouJJ6ithDfmw+EXQMzNPyfnwfVnGHbfyuD7IQkrxZnftImMQpmAMJQwumAizco5Pk9c2kquPWLqwUkm7VPW9T548cYO9r6whoZJLn86dGXX05a/7vHSrR00TnW7grrFL+Zoeiugdktd8KUqondIUCcGCcN7773XbWTFihUEgY/WLnvvvTezZ8+ms7OTCRMm0NAwlGy2/wM6pUB5SyDKQOZtKN8OOl9a9wC7BtZujhul0etVX1qWpsztQKsALywnI52phBCA61q0ZxOEEby9iK6ADuClt+H2R+CEGTbBB/VwLmlcB8o/6GUx2AO6tUwUENdBV4s3Ceg+HZssu2+u2WWqhaUNgR8QRVDmZDlseoIpYxUPvaDIfVD0UlcFe28POsp0ZUEDX2otC00piDo1//pJU9fYG3elqB5pM2GPBO8+ksFNKmpG2Sx7vefV4uVveEyY3n3v3JYFPkO+4Mh+pUVCgjoxKCilsG0bf72zE6UUvu+TSDhks5rNN/8CxhiMUWSzhfq2Vyi3HlCw7CrY9HZ499uQngPuCJh4M6x5ANCoxuOxdQpI4joRFfod9PyTIbcIu/5IYkNPp7UzWaDfQwgxENi2ZlVHgn3PUGw6Kr/e6cNenKs5bm/pWiE+Wr5cMiJY7+Q9DCMqnDTTJsR59lqLq+6GYbVwytcMSTsjGbkBxrI0S2b1DNYWPJ1l34trmXZ4OUrDy7d2ss1RFV33xyoUUw5IMnHPBM0LPOyYIsgZUDBu9wRBIOVCxUK6NIhBQSnNlltu2W1s4sSJLF26FABjDJ4X4fumoLX+cTfIZ+ZGnQ+jL4CWmTDqPJj2Nkz+F7T9F+ITYOt50PxXbPKXRstjHeg5u0Hni+CvQC/7Ndaqa4m58qUqRCnLBHFOvTK/f9ir8+BLX+j5mG/sGWFpudQuPrsoMjhkmNCY4spTcvz0W1mSdkrK8QagMDQM2dTtMT50Cwdlwcu3dvDXY5p497EM7/03wx5nV9Mw2eHQ6+vxM4YnLmulY1nE1/9Yz/AvuBxyTR1WRSiNUoqIZOrEoJDNRowcOZKamhpWrVpFbW0t2WyW5ubmAVU2oAlQq26F6r0ABYt/DmMugea/weq71z2w/ihIbklorPw+Mdl5+ZLN9X/W6r8QqzuRHJX9+jsIIQYOL1C8/r98Fm7VGnjxLbjyVLjk9nxr+VMPM+yyRSSNK8RGiSIDkY+8igYuYwyJIbDNUeW8ensnJoLGKQ6b7VPG3d9t5gtHlDP8CzGeu7aNprd9Gg5yOPh3Q7j3lGaa3slXOTW949O+ImD/K2qJYj6+lNUWFcnUiUEhDCNc1yWZTJJMJlm0aBFhGLLFFlvieQPnaygXJogaT4bULFjzr/xg5c6w+p7uD2y6A1N3CKlcdf7L1Bne42eZ+KaEpudVOSFE8dAf+ha2bQvfxFCWg1L57pZ7brPuUvp5N8F/34Bnf294/Y8h3z8wh20yCCGKn689ph2T4PiHhnLsP4ay7dEVzDxtNc3/83n0F2sYvpXLt+5sYOjmLs3v+oSe6Qro1pr37wzGIPvlFiHJ1IlBIxaLkcmENDYOo6FhKGCRTg+sDgBBEOEltsetT6K992HVH/ObwCgLzHpXxJRNZFyCD0pcfFODGvoD9Iqr8vc79URjriCTTSCbMAlRfDxTRmdW09IOIxug3MmRDW3++azFbf9WTB5jOPMIl8p
4hotOLKO10/DA84oxjXD8AYa446OiHEbOy4QoKb7yiSUNL16TZs7MdLf7FjydJdMaMue+/PjYnRNoG9bfYzxRrWUrpSIlQZ0YVHw/XK+T28DJ0K2vI5PAtnegsmoSuuE4VOvD0Hg8rLiu6zHR8DPI+GXrnpMto7zhZ7jDToOgDWM30JGrymfxhBBFxTNlnHOT5k8P5csq66rgmatjPPay4QdX5seemqV44HnFf66Ok7DSXPejOGGkiQxUJ3J43sC6oFUItm2T9ixiTgSRL2uDRMmIjKFhisOcmd3H68bZzL5nXeOTdx/LsMPxFTx3XUd+QMGeP6uGWFCIHZ9EH5OgTog+EAQhLR0VJIdeQszOopSCusOg9Umo/jK+PYlMxun2nM5sGVAG1EMOJEMnRPHRGlpa1wV0AKvb4MxrYdetutdiLloJTW2KEdURDmmcD+72vP6c8cAUqDJufUhx7zOaaRPzWc2kky7Zjo22nX9xyNrK0uD7IRP3SjB3ZpqVc/MXeEZs7TJsqxgPn7em63Ev39rBwb+r49j7G1m9wKd+kguxAD+UtXTFSII6IXqZZWkSTidaBWTDKlKd8Q/Gt8eq2ZEgiIj80jzxEKLUaa1ZvKrn+Pxlim9+ued4Wazv5zToaIeLb9dcc08+MH5+juLJ1xQP/TpOTJfW+kLb1qT8Mt5YkL89aRSUu2l8X4K7YudbOQ64qpagE1DgJEEZcMsVXmf+HENbUD3GhiqPIVtDEGYLO2nRpySoE6IXxZyAcv0/9Pung9+MM/RkgqrDaEuXEYaGcCC16hRC9LsgiNh8HCRi+e6Va31jD8PUsWBZqquj71d3M5TH5TPjw3Khw80PdN+X762FkPY0sXiBJlUgqaCMPU9T/C+/uw/jR8Djvy0jpjoLOzHR56LIEOkcaxtke4ClNUfd3cgbd3USZGGrbyZR5QGhLOUoCRLUCdGLkm47etYuYPL1Ufq9H2CPr8SJfQ1fymKEGLAsS5MO4qzpUCRiUBYLceibq9oJx+OJK11Ov1qxtBkO39Nw3H5gk+Pt212eeR0mjoSxjRGOkivrPRhDfTUsXLFuSCmIldgZTSLh8Kd/0hXQAcxfCnc+Bicd4JDJyLrLUhNGEaGb5QvHxTHG4Ps5JJ4rHSX2EShE37FtDR3/7Qro1tJNN+GO3Rs/KPuIZwohCkkp6PASTD9VdwUKR3xZcel3EziqD8r5Qo9Nh4fccV6MKFJUJAIiP5+2q3B8DtxR56/Cy9nYBiVdjyt/qDnkZ7qrOcrJhxhcu7TWCWmtWbiy5/j7y/MXKUTpyuVK670g8iSoE6KXRJGB2Jge4yY+nkj2mxNi4NIOF9+uumV+/vyo4gdf04xvVH0SXHleSEKnQUP0oYSKNLv4eEEQst2mPu/c7vDCXJg8FhqrQ2xyn/jcYpLN+hw7w+H399Kt8+dx++XvE0KUFrmUI0QviSJD6IzDVM9YN+g0EI04l6znfPQThRAfK5GwiFT+v3i8969FeoFm7vuqx/i7SxTL2pJkTTlGy4WZgcTCozKWYp9tM4xrzBGPaXIkCVQZ6NK4Xh2GEY01IQ9cZthlS9h5C/jXZYZhtaFcGBCiBJXGJ58Q/aQ9U07F2D9iR80QtmJi4+jIVGGMfMEK8XloW/P4a5pf3BpHKbjgWI/tJxtM0HsNRBJOwOFfdvjv7HWBnWXBuBGKHb+bL8+8/xKXbScYcjnJgAwk6TBBU7PizfcUO0yBIATHilOTzGHC4jtWSoFS67LHDhl23MzmjnNcDFCV8KT0TogSJUGdEL3IGGhPJ1EqCYzF+IaBukm6EAOd61q8u8zikHPW9fU/4GyXV27UjK7L9tqeZEEQcvAuIavWWNzwD8WQKjj/WPjjPyH7wRLZEy5VPHGlS8IqvkBhsPJVGVf/XfHrO/PBuFJw+znQmVHss0OMmCquYxWW5WiljRa/jbFlo4hlE0SBIZcLiOl8IJcrrQpUIcR6JKgTog/k1zdIkwMhNkbMhZs
e6Fm6fNvDil8c65BO994u3LbJcPJBNsfOcAgii6N+qXhuzrr7F67MZ0jEwOEHmt/8bd0xMQZ+dA08+lt4Yz7sOEkThsVxUS0oy3He0t/wQOsTAFRZFdw/+UZqolppqCOEAGRNnRBCiAEqjDRTRvcM3DYbHfbJyboJAxJWBtuCFS3d79tne4PWxREgFAvPhw9v/dncBo4FazrymbtioBSsNi1dAR1AW9jBxUuuIXSLKxsphPj8JKgTQggxIKXTPodNt9lsvaayUzeBA3bWfbpuqDKW5cFfG/ba1jCkCr65p+HaH0Ncy55xA0ncNWw5vvvYN/eEN9+D3bYyRdMsRCnFSq+5x/hibwU+sn5OCJEn5ZdCCCEGrJiV47HfRCxpUiilGDEkwlUevdcmpSffDxhSnuamM+P4oSbhhthkiIojRiga5W6G+y8p41d/glfnKfbZHo7b3xAGUGZni+Z4RZFhcvl4EjpOJlp3YeEbdftRFiYIpNRfCIEEdUIIIQawIIiwyDG2Pn/bmJ4ld33177oqjWsDkbQ7GoiCICKhO7no+BgZz6Isbgj9AOyg6NaZxXIJ7t/sBn6x5Hes8Jo4fMiB7F+1J0GquH5PIcTnJ0GdEEKIAc/IuavYAGMg9HO4CoIi7vwY+Yqh0VB+O+J8QhWSCMsIJaATQqxHgjohhBBCiAEuDA1W6GIBoZRcCiE+RBqlCCGEEEIIIcQgJkGdEEIIIYQQQgxiEtQJIYQQQgghxCAma+qEEEIIIQYIu8KQVhnAkIyS+J2FnpEQYjCQTJ0QQgghxAAQVma5sfkv7DH7CPZ68yjuWHMffmWm0NMSQgwCEtQJIYQoepYFyaSL61qFnooQGxSPO8xKvcVvl91MZ5SmLezgoiW/5/3cEnndCiE+kQR1QgghilqCDlTHIhY8dDXp958nQVuhpyRED7areKj1qR7jD695mljMKcCMhBCDiaypE0IIUbSSCYsVLz3B05d9u2ts3PQj2PrYS8lQVcCZCfEhkWJacgp3NT/A+Phodq3cntawne3KtyAIwkLPTggxwElQJ4QQomiFqSZeufnsbmMLnvgz0466EEq8ok1ryIRJcr4iCCEZh3I3g+9LAFEInZ059q7ZjYROUGknmbn6UYY59Uwrn4qflmMihPh4EtQJIYQoWkop/HR7j/Eo8Es+qMuE5Zz6O7jvGQXAtIlw38UJYki7xUIpy5ZRY1dyzLtndI3d1/II/9rsj9id8QLOTAgx0MmaOiGEEEVLxZJMnHFit7HacVthxcoKNKOBIRazeX3+uoAO4LV34cZ/QKJMgodCSakM1yy/rdvYSr+Zedn30Fp9xLOEEEIydUIIIYpY2nOZevBpVA4bz6Jn76Vm3BeYfOApBE4teKVb0uY4Fq/N6zn+xnxF1pPrvYWiUMR1rMd4XLsQFWBCQohBQz65hRBCFLVUVMGI3Y5mp/+7mSmH/Yy0qcIr4YAOIJv12P+LPce/trsh4Qb9PyEBgOvHOWP4iVjr1QZPSoxjtDuCKDIFnJkQYqCTTJ0QQoiil8n4QDlkCz2TgSEIDPVVEbf8VHPujYpUFr5/iOFLW0M67RV6eiUrDCNGWSN5cvM7eWDNE4xwh7JT+dY46TgREtQJIT6aBHVCCCG6aK0oT2SxSOGZWlLpQs9I9BVXpTnoi3F22yp/KlCRCAm9zIYf69qkPAelIGF7Rd0hU2tQSUOGDJWmgmxn//6uyrOo9Kv5dvJQjDEEnZEEdEKITyRBnRBCCABc16IythIWnovKvIlVcwCJoSfR3F5R6KmJPpLLZin7oNIv/IgEnU+C2f+zuPwvCseGc4+2GDXER5tc/020n7iuRWe8nWuW3868zAL2rtmNA2r3xG6PY/oxrjKGog6chRC9T4I6IYQQAFTEVqPe3BNy7wGgUq9j/CYqh/+S9lTP5g2i+DmOxYJlFnudvq7z4kMvKGbf6lCbyPVroNMfOuPtHP7OaczPLgTguY7XWOE18f2Gb2M65JR
JCDFwSaMUIYQQAKiosyug6xprvgNHSw1mydIO18/s3ko/COGOf0M87hRoUn0nHWW7Arq1/tL0T7Kq+LKSQojiIkGdEEKIPJ0APrQXljscWc5TurQyNNT0fAEMGwKm2NJ0gKt6BqrVdiVREf6uQojiIkGdEKIgYq4m7hjZUHcACaIYZtgp6waUjdnkd6T8IRt8vGUpkrEcyXga25avk2Lk5XKcdDA01KwbGzcc9t8Jstni2/rAxeHQun26bisU54w6haqosoCzEkKITyYF4kKIfmVbikrVgnruGlTHEswOPyRbPo5UGC/01Epea6qc2mFnohuPh8w8KN+GbFBBNtOzYUPcDUjq91GLz0WFnSRGnEGQ3JrWVHkBZi76ijFQFc/y4h/iPD8H4i5M2xSSdpqg+GI67I4EZ448iSMbvsrbmfnsVLE1SVVGrkN2/hZCDGwS1Akh+lWlbsO6bltINQGg3vgz8WMeI1e7A0EgJ06F1tJZidaVWPZ4/I85kU06a9CzdgDjA6Dan8Ke8hDlZdvSmXb7a7qiH/h+QFx1stc0K99iP4iKMqBby2lPMM4ax2ZlE8l0+oWejhBCfCpSLyOE6DeWpVHLXu4K6NbSz1xMzKQKNCvxYVEEvv/RAZ3jWKg1M7sCurXUyhuIWbK7d7Hy/bBkLryEYfTBhvVCCDE4SFAnhOhHBmMneo7aCVDycTRYGGPyDVQ+zGmAKI2SZZJCCCFEv5KzKCFEvwlDg2nYHIZsum7QcjDTLyATyZq6/qS1Ih6zcF3rMz83CCJMxa5QNnXdoD0EGo4lUsmi27tMCCGEGOhkTZ0Qol+1hZVUHfMEJtNKYJXhJCrIeJoolEigvyTjWWLRe+iVf8DEJ2KGfJu2TBXhZzgGa1KV1E5+CNKvofzVkNyCSCXpzEmjlM/CsjS2rYkig+/3bEgj+o5t6/yFJrkKIYQoAhLUCSH6lVLgOdXMe28VTU1LGTp0KOPHj8fKhoRyTtvnbNsi7r2AfvsA4INd6Vb9gcrJz7AmVfGpf04UGZo7qiiL74YbzxBGkM5VEIalseaqNwTEeW+FxSMva4bXGfbePsJVacl09rHOMIWp9Hgr+z6VdgVDdB2WNPcRQgxyUn4phOhXrqt47rnnePPNN1m5ciWvv/46L730ErGYfBz1h4Tdhl5yYffB7AKUt+Bz7RmYzmpaU0k6MkkJ6D4Dx7bQmZVEb1zPl5O3sOMmK/jlrQpPypD7lE5ELM2t5JoVt/FM+0v4KmBm28NEcWmKIoQY3CRTJ4ToZ4bly5d3G1myZAnbbLNNgeZTahSoDayjU5ZkiPqJUqCyq3j0/7bCT7cDkKgdxg/PeYVs0IjjFHiCRUprRZNezT5zjsY3+T0Zbm+aya2bXo5necSRP7wQYvCSS+NCiH6ndfePHsv67M06xOeTCSqIRl3IB4WXeYnJRM5YWVvUTxxb8/b9v+sK6AAyLctpfeN+ElIF2GeUY7h2+e1dAR1AS9DKSx2vY0nLViHEICdBnRCin2mmTJnSbWTLLbdEyZYG/SIIItLWVoRbvUE0/EyicTcQTH6U9synX08nPj+lwMR8/Exbj/u034ZrSwlrXzL0vHChlcZFomkhxOAm5ZdCiH6VyYRMnDiRUaNG0dzcTH19Pa4bI52WLin9wbI0gWdIMRQ15OeEYYTfGcIGTnZF7wviHr9a9ntO3v9Y5j9yKybKv+6tWIJJe34dP8gUeIbFy/iK7w89kpktjxCY/N+91q5m35rpqE5ngwGfEEIMFhLUCSH6XTodYVlxRo4cQy7nk8lIdqI/OMojbF1C0/9ewRhDzditcOrGArFCT61kBJbP31Y/QKJKc+z/e5yl912PFUuwxaE/IXJrMPJW6DNRZKgLanlyizu5beV9VNrlHFa3L4l0kjCSgE4IMbhJUCeEKIgwjEin5Qy2vyiVL+/rXL2Uhf+5G2MiEtWNOMlKdGI4kZzU9guFwlUOt7b9k0fcF5nxtV0AmFBbjd0pJch
9zrPYpGoUp9UcjzHgdQSEkqETQhQB+QYRQhREXHUQD1cTD5uJ04GWT6M+5bo2XkcLj/38AJa89ABLX36Ixy44EK+jBdeRP35/cfwYPxh2NADLvFXctPoeGuNDsX3JlvanXC7A84JPfqAQQgwSkqkTQvS7MtXBK388gwVP3gnGMGqng9jx+78njTTr6CuWpVnw5J8x0XrZUWNY8PhtbHns/4OcrGnsDypncUT1wXy5amde6XyT7Sq2ZAh1kJHAWgghxOe30d8ic+fOZfPNN++67XkeZ5xxBjNmzOCQQw5h/vz5G/tPCCGKiOvaNL39HAue+DNrN0Zb/NxMlr78APG47BPVV3w/JDlkVI/xsiEj8X3JWPQnK+MyPDeSQxL7MjQ7HCsjnReFEEJsnI0K6jKZDBdeeCG+73eN3XbbbSQSCR588EF++tOfcvbZZ2/0JIUQxcNxLFbOfqrH+Mo5z2BpWdvSV3w/ZMxuXydZvy6wKxsygnF7HIXnSZauvxlj8P1Q9gYU4jOwLI3jyL6mQmzIRpVfXnLJJRx99NG8+uqrXWNPPvkkp556KgDbbbcdLS0tLFu2jOHDh2/cTIUQRcHzfEZuvz9v3X9Vt/FR2+9PILFFn/J0NTN+8zwt/3sVg6FuwjZ4ugpCaVgjhBiYbFvjOTkiO+TNzHwWZpeye+UOJP1ylCcBnhBrfe6g7rHHHiObzbLPPvt0G1+1ahX19fVdt+vr61mxYoUEdUIIAHw/omr05mx5+DnMvec3GBMxab/vUz95Z9I5KQPsS2EYkaGC8k2nA5AJIwnohBADVhTzmR3NxwQRVy6+hRc6ZgFgYfG3za5mSmwSOfneEAL4FEHdgw8+yMUXX9xtbNy4cXR2dnLLLbf0eLwxBqVUt9v6M7S1q6sr/9SP7Wv19dK0YaCRYzLwfL5jUsHmh57BpBknYjA4iUpi5dUke312pUfeI59eFIakVy8FE6Fsl2Rd31x8lGMy8MgxGXg2dExe73yLE989m+sn/KoroAMICbloyTVcNOYMRtcMo9KW49kXBsL7JPAi/IwhzBmCHNhxSNRYWLb65CeXmE8M6mbMmMGMGTO6jd11111cf/31fOtb3+oaO+igg7jjjjtobGxk1apVjB49GoDm5mYaGho+9YRWr+4cEPsl1ddX0NTUUehpiPXIMRl4Nu6YKKAagEwGyMix3VjyHvn0YpZPavHr/OeKo0k1LaZhyhfZ9YzbCWINBEHvZS/lmAw8ckwGng0dEyemuaX5bhzlkImyPZ7TEaaYlZpLLvQZ648ZEOeOxaTQ7xOtFU4QY/4TGWpGOzxwVgvploiyWs1BV9WRGBUSFmmlidbqcyW5PlejlMMOO4xHH32UmTNnMnPmTABmzpxJeXk5u+++e9fYyy+/TCwWk9JLIYQQA4oOUjx2wYGkmhYDsGruszx31fewg7YCz0yUCqXyjaMsS7az2BBjhzQ4tbQErVRYSYa73RMExzUcxkh3GHet/qc0TylClu/w2C9bGTo1xkPnrCHdkg/g0i0R/zi9BSsnXYM/rNf3qTvyyCM577zz2G+//XBdl8suu6y3/wkhhBBio/jpNoJMZ7exlW8+jTJBPoksRB8KcWnpcPjnszBlE9h6osFV6bW7vKC1Ir+SRRFFpiS7pKZUmn1qvsTfmx/i5wuv5PoJF/H35gdZklvOvrXTCU3IM+0vMaPmSyX59yl2CsUO36lEaUXnqu5d1DpXhvhpg11tEUTSYW2tXgnq3nnnna7/j8ViXHrppb3xY4UQQog+4ZRVYrlxQm9dWVfdxG0xG799qxAfy7YtXpjrcPDPdFcQt+c2hlvOShCzsniJLGuiNcStGG+m5zEmNpJGPQQrHSvsxPtZYEJ++v5l3Dflel7seJ10kObI+oN5vO05blt1L7NSc4mrGMc1HorXLif2xUahuef7K9nr/BqqR9u0LlrXEKdmjE3LewENU22w5divJd9eQgghSk5kJdj1x3/CKasEoGLYeHY
+9UZCt6bAMxPFLuU5nP2HdQEdwGOvKNozGq8sy0XLr2aJv4Lpbx7BSfPPZd+3juWXy68mTOQKN+kCiJkYE+Jjeaz1Wf7f0puosMvZa85RXLTk98xKzQXANwFEklovNpalaJrnkW2PeOHGdvb+RQ1DJjoADJnocMBv6nj7wRQYOfbr6/XySyGEEKUh4fok3BREWSJVRme2iiDMn6lalh7Qi9hzUZzaqXty0LWzCf0s2okTunX4/sCdsygOxijSPft+4AWw1FvOKcOO4ocLLiAw6zIQ97Y8zOkjvkMFpZOtc7Nxzh35Q97O/Y/3ckt4tuNVZtTszgNrnux6zDeG7IcTyNqqYmMMxCryeadVb/k8dlEr2x1XQd0mNqEPc+5LMXLrOFjyeb0+ydQJIYT4zBIxj7L0PVivTcSatSnO3F2ojK0goTqIBavILnqeRNRCQmcKPdWPlAsdMqoGzx1GVtVIQCf6RTLm839f774GbNJoGFJlWBE20RZ0sNpf0+N57WFnSTVViSKD6nDY1B3H9Mod+e3SP3JkwyGcN+oHHFC7J78dey4/HnYiKiv5iWITRYaK4ZphW+UD9uZ5Po/8fA1eyvDgT1uwXMWkGQlC2y/wTAcWeScIIYT4NdU4fAAAIABJREFUzBJOCr3gu8AHgVDufbxUK4uef5AXrzsVjEHbDtPPuZuayV8iU1qVY0J8pDAI+NpuAaMbLf70kGaL8RHH7w9WRSuPL3mWg+u+wleH7MM1y2/res4wp57ABHhOFissrcyUnYpx+aifsdqsoS3o4NDafflGxYEQaPxOWU9VrHzLY//f1LBmQUTLez5Dp7q8dHMHBsMXDk+SiTJIf5zuSueSjxBCiN7jN9EV0H0gpJ5XbjyDtd+0UeDz3FXfw2RbCjBBIQYumyy7Ts1w3elZfnhIDpcULX4rh9bvy/fmn8M25Ztz1siT2Do5la/V7cNfN7uaPzfdj6H0zmKNASsdozE7lEnhpkTtFn7G4PsS0BUzYwyezlE5KWTi3nHsBGx/fAVfv3kIgZuTgG4DJKgTQgjx2TmNoBPdhkIvR+h3T8mlW5bLDgEDjFJr2+WLQgqCCBP6BH6+q1+VVYGFpi3s4Ph3z+aVztnsWzudGruKNUErB9Z8mXiQ+ISfWryMQTYYL0FhGJGNsqgaD2eoh2flCEN5HWyIBHVCCCE+s85cOdHkf0N8HKAwNftjuTGqRk/u9rgR2+6D0k5hJim6cV2brCnnneVJlraWkzNJCe4GEDcXo86pYXx8DBERj7T+h18uvpo56XnU2jVMticSBB9kx+MhfnkGP5lBu3KCK0qDZOc+nqypE0II8Zl5vkW7PY2yzZ5Ba4MfuBjK+fIF/+SlP/yI1fNfZdhWezDtyF+Q01UwgDthlgKtFWvScXb/gWJJU35s+jS45adJYipV2MkJAIyvGaKH8KdNL+dnC6/g9dRb7FyxNeePOo1EJtkV0EXlHr9beTN/brqfuI7xkxHfZUb5dKxMaa21E0J0J0GdEEKIz8UPItqCivVGAix3KDuc/HsiP4uOlZMz8QG9tUGpMDrGr++kK6ADeOI1xdz3YdsJSsqZBgg/F1HhV3PF8HMJtY8TuehOh8Dk30O2rXmw/b/csupuALzQ55xFV7DtlC0ZpoZjJJUhRMmS8kshhBC9JgxD0lE5WWsI6SBO+Cl7GcTjDomElGn2lZyveX9Fz1rLd5fkAwUxcESRwco6uOkyVNbuFqgFVsC/257p8Zz/tL8kx1GIEiefAEIIIQom5igSZg2Ln7iBRY9eT8KswZUakl5XEQ84Yq/uWRzLgr23h1xOuggOFnZks335Vj3GpyWnSkZciBInX51CCCEKRnur+ccPtiHXvhqAWXdcwIFXvwq6rsAzKy65nMf0aQ6//QFcf7+iMgkXnWioSPgf3plCDGCBH/HVmn14su15/tvxChrNt+sPZqw9EhtNKtZBymSI6xhOaGOl4oWeshCin0hQJ4QQoiDKylzeve/
WroAOwOtcw7yHbmTSoeeSTnsFnF3xcUyKI/eKccDONpY2VCU8crmg0NMSn5GdinPlqJ/jWx4ajRO42FmXtmQr33jnByzKLQPgmIavcXLjUTidpbsNghClRMovhRBCFIyf7ew5lulESa/9PuF7OZJ2irhOS0A3SBljsDMxEp0VxDqT6KyDSQT8eukNXQEdwC2r7mZN1IaWMz0hSoK81YUQQhRENuux6T7Ho511rdi17TBpv++RyeQ+5plCiPVlyPBOZkGP8QXZRViWnOoJUQqk/FIIIURBRBGoxBAOvPo13rz7CkwUsfnXfowqG0Ik67yE+NQqTAV7V+/KnPS8rjELi62Sk/E75M0kRCmQoE4IIUTB5EIHnRzDtO9cCURkfYtAIjohPhM/bfhW/cGs9tdw9+qHaHSHcMHo/yMeSKMUIUqFBHVCCCEKKooi0jkAhbRiFOLzsdpjnN54AicPOxIDVHqV5DKyXYUQpUKCOiGEEIOGZWksSxMEIVFkPvkJQpQQk7JwSQKQQwI6IUqJrJ4VQggx4CkFcZWi/e1H+N+9vyRsmotLptDTEkIIIQYEydQJIUQvcRwLO+wEbeETJwiklLC3OCbNi9eexOLn7gNg9l8vYsdTrmPoF4/Al878QgghSpxk6oQQohfEVIrckld44eoTePm6kzGtC3At2Ty7t6gg3RXQAUza97vUjtsKlVpGwrTiOFYBZyeEEEIUlmTqhBBiI9m2JrdyIQ/+ZHcw+XVei5+7j4OunwN2Q4FnVxyMWZf1nHLwqSTrR/PwmV8i9HOUN4xhr189jJ0cKdlRIYQQJUkydUIIsZEcK+St+6/qCugAQj/H+0/9jbIyp4AzKx7KKaNh6i6gFJvsfjgv3/QTQj+/QXnnqoU8f833sYO2As9SCCGEKAzJ1AkhxEZSyiJWUdtjPFY1hFAa0PUKT1Ww21l/Y/HzMwlyGUzU/Q/bsuB1CH2QKkwhhBAlSDJ1QgixkbJexJRDTsMtr+4aK28cy8jtZpDL+QWcWfEwxpAx5Qzb5Sgqh0/AjpV1u3/YtL3ASRRodkIIIURhSaZOCCE2UhRBFG/gwGteZ/msx7BjZTRM3RnfrgFf1nj1Jt8P0VYZe/7iXzz72+PpWLGAkdvOYNvvXEaOMkD2rhNCCFF6JKgTQohe4PkGVA0NO3wTMKS9UAK6PpILHZJjtucrFz+OUgqjLDyriiiUv7cQQojSJEGd+P/t3XmA1VX9//Hn5+6zMuwjsiiSGq5fRdPMCDMRUQmXwi+V2a9SW7BFDcu0TAOV0rZv+W2RIrC+lKmYaKbihomQaCbuIquyDDDr3T+/P8ip+ULfkoG5c2eej//uuXfufQ+HM/e+7jmfcyTtRtmsh6Z1hUy2AJG+f28w0EmSejGvqZMkSZKkMmaokyRJkqQyZqiT1KMFAQRBUOoyJEmS9hivqZPUIwVBQCJsIrv1DQrZNDV7jSJPlGIQJ5dzh0RJktRzOFMnqceJRAIqIi2sfvgWFl09mdaNK1n92G954jvn8uqd36IiaCIScfZOkiT1DM7USepR4kGW/NbVLPn1DAJC3n3pXJo3rOSx710AwMbn/sTaJ+7i3ZffRobqElcrSZLUeYY6ST1GEASELW/w+2ljCIsFAF5b/DsmXPcge4+ZwKEfvIz0to0ka/sTFDMQMdRJkqTyZ6iT1GMkElFWzP9Re6ADCAt51i2/j8M/9DXuvfxkss1bABh5woc4/KPXk6GqVOVK2gXRaEAm0UYxWiQoRkhlKil4TqGkXs5r6iT1GGEIydr+O7QPPmQsT91ydXugA3jl/l+Sa96EG2NK5SMajdCY2sqcLbfy64Y7+a+Nv+CNxOtEow5kSb2boU5SjxGGIfu+5xxSfQa2t/Xb7wiqBw6j+Y1Xd3h82+Z1HncglZFcKs22YjPpMMPixmX0jfdhW7GRTCpd6tIkqaRcfimpxwgo8NK9P+e9X1vAtjXPk2trYsh/nMhL9/2cEe88g60rn2l/bCx
ZSZ8Ro0kXPd5AKhf5aJ4rV97AE81PA/BY05Os7L+GL+19ATGSJa5OkkrHmTpJPUZIlD7DRvP0r2eQT7fw0r0307bldZ6+5RoG7D+GQz54GTV77Uf9IWOZMOth8pHaUpcs6S3Ih/n2QPemOxr+6Iy7pF7PmTpJPUYuV6D+8BMpZFtpePlJNr+4jJZNa6gdMor7v3EG+777Axx2zuVkmreS6juE1qIfBKVyEgtjRIlS4O+bIVVFKgl2w1hOJKK0JVvIkacurCPdnO/0c0pSV3GmTlKP0lasZK+jz+CAiRcSRCI8e9uNHH3Bd0hU9eGVB+bx2A8+ReWAYeRCl2pJ5SaWT3B+/X/SL1ZHMkgAcPnQz5DMVnTueatC1sbW8oXXruG8Fy9h7tbfUeiT2R0lS1KXcKZOUo+TKSaI1wxhwqzFPH3LVbz+1ANMvHEJxXyWaLKafLSKXNE/f9p9otEIEFIo7HiNZjQa2emW+0EAsViUYrFIJBKhWAzbHxeLRYhEAnK5ImHodZ9vihVinDfoLN5bdyx1sVoqggpS+RTFTu6T0hptY9Jfzqe12AbAN1Z/j2gQYUrt6aRbCv/ipyWp9PxUI6lHyoUJYoMP5qhpPycE8pEUhVhIDsAjrbSbJBJRYtnNNK9bSRCNUjVgOLl4P3K5AklaaHjlZRrXv0r/UUeSj9WSL0YBqMhnSGzZRO7ue4kdPJrIoIHk16wlOuYIKIbkH3qcwjN/peb9p5MbXE9b1JnlWCzCutg6Jj9zAS3FVgA+Ofgczu8/tVPPm0hE+XPL8+2B7k2/2bSQ8XVjqaC6U88vSV3BUCepxyoUihTe3BFvJzMoUmfFsg3cPf0Emta9BEDdPgfzvm/cTUCMJ392Ma8umgtAJJ7g5GsfIl5/MAEQfehRtpz19zCSOHUCyVMnEAZP0nL9jeQefASA1q99k9pbbyH2nhPI53vvtxFhokBzsokvvzKrPdAB/Pcbt/DRQWdSSWKXn7tQKDI4MXCH9r0Sg0gS3+XnlaSu5DV1kiTtgpqaFC/9cXZ7oAPYuvIZVi+5k2Qq3h7oAIq5LEtumkYk10SqeRstl17e4bmydy4kdujBBFVV7YHuTS1f+Tqp5sY9+8t0Y9FohDWRtSxvWcGazPod7t+Sb6Qzm18WCiGD4/05se649rbaaDWXDb2QVKZy159YkrqQM3WSpB6jqipBoekNCAKCyoG0teX23IuFRZrWv7xD8/aQt2PKaNvyOoR5AiKEzS07Pl8+R1jcyWxcJkPQi6+ry8YzXLf2JvZJDuXkvmOZveE37ffVRWsZFOtP2Mlr6hLNFcwYcSkX772FjbkGDqwYSUWuimzW6+kklQdn6iRJPUJF0MTGpxby0MwP8PB159D4wsOkguY99nqZbIG3nfT/OjYGASPH/SeFXJ6KvvUd7nrbSR8njNeSqaym4qILO9wXPWg0xU0NkM0SHf32DvdVXPo5MjW990zFkCLpYob5m+9iQt+xfHzwBxmaqOf42qP43YE/IpXu/GxasQjxxgr2zgzhyOAw4o2V5Nt6b5CWVH6CsJttq7V5czPFYulLGjiwho0bm0pdhv6BfdL92CfdS2/uj0QiSmbtcu764t+X0BEEnP6D5dBnv53uPrk7VASNbFzxKM/8z7UEkSiHTb2SviOPIBupIZrewFNzvsq2dS8w8j1TGf7uc0gXtweQmnwbxQcfIjP318QOO5jUB84kvWAhqY9+CALI/OIW8sufJnneh+HQQ2iOdW7L/nIWi0VYFi7n3JcuJhUkmTzgJMbWHs2YqkNJtFT+W58ZIpGASCSgUCgyYEDvHSfdVW/+29Vd2SelE4kE9O//1jdoMtT9E/5n7n7sk+7HPuleenN/1FQnWPrDC3np3ps7tB901iUcMvUampo6uT7v/1BVlaTQtB4IiFQPprU1C0AQBPSpCmhpbIRkLblcx2AZi0WJZ9OEySTkcoTRKNkihCHE4xGixSI5dn4cQm9TTOV4rbiamzf+hmH
JIZw78Awq26r/rc1jCpUZVufX80L6FY6tOYJByX60Nfhv2p305r9d3ZV9Ujq7Guq8pk6SVP6CCNWD99mhuaZ+X4o7u05tN2ppyUCk3/Ybfwt0AGEYEq+oJtdchNyONeTzBfKR+N/ui8I/XL6VyxW3H7/h+RsARNJxRkX345pBlxKEEfJNRfL/xr9NviLLN1//PrduvgeAgIDZ+1/PkbHDe/VuopJ6Hq+pkySVvaamNG876bwOwa7P8Lcz7B2n0dKS/ec/qLJRKIQUMpDP/vthLBvNtAc6gJCQr712I22JnWxUI0llzJk6SVKPUEj2Z8L1D9O49gWCaIyavUaSj/cDdzDstXLhjrufbi00EUZLf5mHJO1OhjpJUo+QyRSAPqRGHEMYQmuhYKDr5SqjKfZLjeDl9GvtbR8YMJFo4EIlST2LoU6S1KPk8wY5bRctxvjuyCv4n0138XL6Ncb1OZZ9U0PJFfNES12cJO1GflUlSZJ6pFg2zrKmv5IpZjm65jAealzCgs33ES8kSl2aJO1WztRJkqQeqZgNmFhzAodWHci92x7hvIFncUTNQRS3+vFHUs/iXzVJktRjRdsSjIzsy2f6jAKgNl7BRjx/S1LPYqiTJPUYqVScaDQglyuQdZMU/U0umWFtuJHHmv7MUcGhDKnci2irSzAl9RyGOklS2YtEIBU2sn7J/Wx67k8MO+Z0+ow4hLawptSlqcSCZJFbGxdyzZoftLddWD+VT9RNJZLxY5CknsGNUiRJZS9RbORP//VpHpn1EZ6787+49/KTefkPPyUVz5e6NJVYJp7hhnU/69D24zd+RT7uofSSeg5DnSSp7IX5DKsfu71D2zO/vZ4g67VTCskUOwa4fFigSLFE9UjS7meokySVvWAnbWEYdnkd6n7i+QRnDhjfoe3EPsd5rIGkHsXF5JKk8hdLsvdRp7D2ibvamw6a/AVI1ECuhHWp5IJ0jOl7fYrDKw/ivm2PclztGN7f9ySizQmM/ZJ6CkOdJKnsZSO1vHPaj1m7bCEbnn2UEcedSd+R/0Fbzrc5QaQ5wemp8ZxSdQL9qmto2NRqoJPUo/huJ0kqe8UitFLDXu/8T4Ye/2Gy2TxtOY800N/lc0XIRYnWREtdiiTtdoY6SVKPkU7nAXe8lCT1Lm6UIkmSeoVoNKAht418TRqq8kSjO9tiR5LKj6FOkiT1eNFoQENqMx998WKO/csZfHbVlTRXNhJLGuwklT9DnSRJ6vHSqTY+9MLnWdb8DLkwz8ONT/DZV75Oa6K51KVJUqcZ6iRJUo+XDbKszq7v0La0+Wmaii0kEm6eIqm8GeokSVKPlwjiVEcqO7QNS+xFtuhBhpLKn6FOkiT1eIlMiu+MvIJUJAlAbbSaa/b5IoNjA8hmPf5CUnnzSANJEpFIQLHocczqucJswJGpw3jokF+xOb+VZJCgNlJDrC1JsdTFSVInGeokqRcrJHOkY228llnLyMph1OaSRNuaKRayBLEExaSzGOpB0lH2rqmnOt0HCMlmCwY6ST2CoU6SeqtkkT+0LuKyVdcDcP2gizhufQWLb/gYmaYG6oaPZtwVtxGrHEI+37mPvtFohGiYhiBCMUhQKISEoTODKo1s1gPqJfUsXlMn7QGxWIREIkYk4vlH6r6y8TRfX/Pd9tsTK9/BQzOnkGlqAGDrqmd57DufIJbf2qnXiZMhu/4pln7vPJb/96cIt75M0PgKKZocI5Ik7QaGOmk3CIKA6mwrdY2b6Nu4keo1K4nc8G2qX3iWqnxbqcuTdiofFkgXMwBEiJBvbaKQTXd4zMYXlhCEuz6rEYkEhK0baVz1V4YedQqtDev4/UVHU8y2suiq00mGjZ36HSRJkqFO2i1qs820feijNIw6hM2jDqXpoktJnjCWzJxbiDzzDB6BpO4oEcY5tOpAAIoUiVbWEEt23PJ90OjjCIP4rr8GLWxc8SjP3v4dnr/rR+w//uOMnvx51i67h0RNX15ffh+xmG9FkiR
1hu+kUifF41HyC+8ht+jh9rbcAw9SeOU1IoMGkL3j91Q1NhCNusxM3UsiXcFPRs7k7P6nMDI1jOXhKsZ99VYq+tYD0H/UEbxz2k3kY3126fmj0YCWdc+z+DufYOtrf6Xhlad4eNZH2Ovw91I9eF/ybc00rX+JaNS3IvVMsXhAWJEnWRXxgHNJe5QbpUidVBHmST+2ZIf23JInCGprafv+TaTnzafvU39ia7KmBBVKO1cshiRbKvnKwM+SGZwhWUyRGhXllBsfh2IBInHy8b67vElKLBrw13t+ukP7mqULOfiML/LKonnsPeZkyDYDqU7+NlL3UqzO8lJuLZVBBb/bdA+VkQqmDDiNVGsVhYJ7bkravfx6VOqESCQgWLuG5Env3eG+5JnvJ3bYIfR9+A9EBg8i98hil5mp2wlDIBMl2VYJmQjpbEg66Es6OoB00KdTu16GBPQZftAO7X33OYSnfv1NjvrEt1iz9G7CQqYTv4HUDSWLrM2vJx1muL3hXsbUHMKgRH8mr/gkmYrWUlcnqQfyE6bUCdFohOzv7yYkpPrbM4kMGkRk8CCqr/0Gkf79SM/9Nds+9HFqf/oDgspK3MFdvUk2W2DkCR+iz9AD2tv6jzqCmsH78NLdPyHbvIUACBO1pStS2gMKiSwLGu5nyvPT+MH6Ofy/F6fzanoN4/uNZUX6JZfjS9rtXH4pdUKhUCT1jqPg9TcorFlLzX9/F8KQzK9vpeXaG+jz65+z9X2nkfvTEyQ+NMUlN+p1MkEt7/vmH9m2egUQkk+38OC1/8m+46aSbmzgbad8ira8H3C1Z6VSMZLJOPl8oUteL0+BmzfM79B28xvz+eUBN5AL83RiQ1lJ2ilDndQJxWII++9PtE8tbed8lLZvf6/D/UF8+66BYTZHOlUFad/J1bsUiyHZaB21w97OX397PRuffZQDT/00o973UfKRatoKLhjRnpWvbWNJ20ssXLOII6sPZlzsWBLRFIU9nO/yYWGH28kgwbDEXoQ5l21I2r0MdVInNcUr6TNgALEjDif/5+Xt7ZEhexG2tEAySeLsyWwz0KmXKhRCikEf3v6Br3FgPg3xalrzQNdMmqgXi9WEzNm0gG+t/QkAt2xawAl9juW6fb5ErLHyX/x0J143F+fUfiewoOG+9rbJ/d9HXayWBAmX4kva7Qx10m7QXFVHza9+QdOUj5D/83KiBx5A7X9/j/S999N32SO0VdeVukSppMIQsoUYBNXg9xvqIq1BGzetn9eh7f5tj5EN83v0A1Ask+Dre3+ed9YcwUONS3hXzRiOqf0PEsSJtyYpYKqTtHsZ6qTdoFAo0tx/EJV3/IZooUAxEiFfWUlw4GiaIrHtyzQlSV2imMyTjrcSI0oQ7HjNZsCevY4zDCHSnOD05HhOG/I+ChQgH0A2SsFpOkl7gBczSLtJoRDSnKhiW0UtTclqWgoR0kQNdJLUhaLxgKfyz/Cuv3yAlek1XFg/tcP9J9YdR6yLvtPO5YrkWyFsjRJmIoQGOkl7iDN1kiSpx0jHW/nKi7MoUmTq819g0aHzOLx6NHc3PMSRNQfzrtoxJJpTXtIpqUfZ5VC3YcMGLr/8cjZs2EAqlWLWrFkMHTqUxsZGLr74YlavXk2/fv248cYbGThw4O6sWZIkaecisDm3FYAcOY57+mzOGXA6Vw3/HGEmoCqRYmOhqcRFStLutcvLLy+99FLGjRvHbbfdxqRJk5g1axYAN954I2PGjGHhwoWcffbZXHPNNbutWEmSpP9LPJ/grAETOrQta/kLW7PNtLbmSlSVJO1ZuzRT19DQwHPPPcfNN98MwJlnnsmxxx4LwKJFi5g7dy4Ap556KldddRW5XI74387rkiRJ2mPSUb5Y/3H2jtdzz7aHOKTyAKbVf5RUW4W7TkrqsXYp1K1evZohQ4Ywc+ZMli5dysCBA/nqV78KbF+W+eZyy1gsRnV1NQ0NDQwePHj3VS1JkvRPRJqTfLjmLM7uM5FEmIS
WwF0nJfVoQfgvtmJauHAhM2bM6NA2YsQInnjiCX74wx8ybtw45s+fzx133MGcOXM4+OCDWb58ObHY9rx4/PHHc+utt3pdnSRJkiTtAf8y1O3MqlWrmDx5MsuWLQOgra2NY445hqeeeooTTjiBefPmUV9fTz6f5+ijj+bxxx//t5dfbt7c3C22gB84sIaNG72QujuxT7of+6R7sT+6H/uk+7FPuh/7pPuxT0onEgno37/6rf/crrzY8OHDqa+v58EHHwTggQce4KCDDgJg7Nix3HbbbQDcddddjBkzxuvpJElSp0QiAfnqNJmaFrI1LcQq9uwB4pJUTnb5SIPvfe97XHnllVx//fVUV1czc+ZMAC666CKmT5/OxIkTqampad8VU5IkaVdEIpCtbuVLK6/l/m2P0S9WxzUjvsjRlYcTaU2UujxJKrldDnUjR45kzpw5O7TX1dXxox/9qFNFSZIkvSmsKPCddbO5b9tiADbnt/Cpl6/g0UPnU4GhTpJ2+Zw6SZKkrtAWtLG4aVmHtiJFXmh7lVjMjzKS5F9CSZLUraXCFIdVvX2H9v0qhpPPF0tQkSR1L4Y6SZLUrURikK9Os6HiddLVLYTFgC8NvYCDKvcHIBUk+dqwaVQUK0pcqSR1D7t8TZ0kSdLulkxEeCP+BpNXfIot+W0EBHx+yMc4t+9Z3DzqOnLkiAUxkvkktPgxRpLAmTpJktRNpOI5CrFVfPm1b7Elvw2AkJBvr/spLZFWEk2VVDX1IdlYBa0GOkl6k6FOkiR1C5XJFrJtK3glvXqH+zbnthB4NJ0k7ZShTpIklVwkEkD6JWobH+TkPsd0uK86Ukl9fCBhWKLiJKmbc+2CJEm7QSQC8XiUTKZQ6lLKUrEYQsUIKt74KZ8f/QcKFLlz6yOMSO7NtSMuJZFJYaaTpJ0z1EmS1EkVbCPXtJWG1SvoN+oIglglaapLXVbZyRRqSA37GgNWTOSKwR/jC/teQqTiQGLpvuRyhmVJ+mcMdZIkdUIqaOaF3/+Ip391DQBBJMJ7vjyfgYeOL3Fl5aclnaJYex4Vh32QZDFNnEqaWuvIFQx0kvR/8Zo6SZI6IShk+Mv/zGy/HRaLPP7Dz1Js21TCqspXWyZBQ3MdDa31bG2tpVDwcHFJ+lcMdZIkdUIhlyEsdpxJatu6AQLfYiVJXcN3HEmSOiGaqKBu+OgObfsc/wGIxEtUkSSpt/GaOkmSOqGYGsCJV93Fk7+8koaXn2TvI09m9PunkQ6qSl2aJKmXMNRJktQJmUyBaLwfR3zsegqZFmKVfWnJuBBGktR1DHWSJHVSoQCtpCCaIpMpdTWSpN7GrxIlSZIkqYwZ6iRJkiSpjBnqJEmSJKmMeU2dJEldIJUIINNAEAQQSZKhgqLnakuSdgNn6iRJ2sOSQRub/3IPC79wLL89byR//tnFVNBU6rIkST2EoU6SpD0t28Sib55NWCxQU78vrzzwS1bc+V+kEqUuTJLUE7j8UpKkPSgej9Lw/HLe8+X5ROMJ0ts2Ujd8NC/98eeE6S0E0b6EYamrlCSVM0OdJEl7UKFQoP+oI3kfY7BmAAAK4UlEQVT0hvN4/elFAMQraxg/435i8QTpDSuo6DeEfLSGfNEFNJKkt85QJ0nSHhXQ1rCuPdAB5Fqb+Mv/zKDvyMNYPudKYqkqTvrmfcTrDyafd/cUSdJb41eCkiTtQUEQ0Lp14w7trZvXEUtUAJBPt7D4ux8nktvW1eVJknoAQ50kSXtQoVCk/9uOJJaq6tC+73vOYe2ye9pvN617iQheXCdJeutcfilJ0h6Wj9Zyyo1LeHL2l2nbsp4DTrmAWLKS9cvva3/MiOPOohBJYa6TJL1VhjpJkvawfCEgUj2cMZ/+CbFCnsiDaTKjm9nr8Pey9bW/MvTIU/mPj1xFK55xIEl66wx1kiR1gWIxJEuSiqCK9O9eIXpvjONO/i7hhAjBCwXibTWEFU7TSZLeOkOdJEldKBsLiY8bRG7BOsL
Z6e2N/ROEk6JAvqS1SZLKk6FOkqQu1FYo0OfMoVAZo/DYJoJhFSQ+si/N0QIUSl2dJKkcGeokSepi24o5khMHEX/fIIoxaKRIseDSS0nSrjHUSZJUApl8gUwE8KxxSVIneU6dJEmSJJUxQ50kSZIklTFDnSRJkiSVMUOdJEmSJJUxQ50kSZIklTFDnSRJkiSVMUOdJEmSJJUxQ50kSZIklTFDnSRJkiSVMUOdJEmSJJUxQ50kSZIklTFDnSRJkiSVMUOdJEmSJJUxQ50kSZIklTFDnSRJkiSVMUOdJEmSJJUxQ50kSZIklTFDnSRJkiSVMUOdJEmSJJUxQ50kSZIklTFDnSRJkiSVMUOdJEmSJJUxQ50kSZIklTFDnSRJkiSVMUOdJEmSJJUxQ50kSZIklTFDnSRJkiSVMUOdJEmSJJUxQ50kSZIklTFDnSRJkiSVMUOdJEmSJJWxWKkL+N8ikaDUJbTrTrVoO/uk+7FPuhf7o/uxT7of+6T7sU+6H/ukNHb13z0IwzDczbVIkiRJkrqIyy8lSZIkqYwZ6iRJkiSpjBnqJEmSJKmMGeokSZIkqYwZ6iRJkiSpjBnqJEmSJKmMGeokSZIkqYwZ6iRJkiSpjBnqJEmSJKmMGer+wZo1a5g6dSqTJk3iwx/+MGvXrgUgm81yySWXMGHCBCZPnszLL79c4kp7jw0bNvDJT36S97///UyZMoU1a9YA0NjYyCc/+UkmTJjA1KlT2bhxY4kr7X2effZZDj744PbbjpPSWbZsGWeddRaTJk3i3HPPbf/b5TgprQULFnDKKadw0kknMXfu3FKX0yt9//vfZ+LEiUycOJHrrrsOgMWLF3Paaadx0kknccMNN5S4wt7r2muvZfr06QCsWLGCM844g/Hjx/OVr3yFfD5f4up6l/vvv58zzjiDCRMmcPXVVwOOk7IUqt3FF18czp07NwzDMPzFL34RfvGLXwzDMAx/8pOfhF/96lfDMAzDJUuWhGeffXbJauxtzj333HDevHlhGIbhvHnzwosuuigMwzD8+te/Ht50001hGIbh7373u/Z2dY3W1tZwypQp4f7779/e5jgpnXHjxoUrVqwIwzAM58+fH15wwQVhGDpOSun1118Px40bF27ZsiVsaWkJTzvttPDFF18sdVm9yqOPPhp+8IMfDDOZTJjNZsOPfOQj4YIFC8KxY8eGq1atCnO5XPixj30sXLRoUalL7XUWL14cvuMd7wi/9KUvhWEYhhMnTgyffPLJMAzD8LLLLmv/LKY9b9WqVeG73vWucP369WE2mw3POeeccNGiRY6TMuRM3T8oFos0NzcD0NbWRiqVAmDRokWcfvrpABx11FE0NDSwbt26ktXZWzQ0NPDcc88xZcoUAM4880w+97nPAdv75LTTTgPg1FNP5aGHHiKXy5Ws1t5m5syZnHvuuR3aHCelkc1mueiiizjwwAMBOOCAA1i/fj3gOCmlxYsXc8wxx1BXV0dlZSXjx4/n7rvvLnVZvcrAgQOZPn06iUSCeDzOfvvtx8qVKxkxYgTDhg0jFotx2mmn2S9dbOvWrdxwww1ccMEFAKxdu5Z0Os3hhx8OwBlnnGGfdKF7772XU045hfr6euLxODfccAMVFRWOkzJkqPsHF110EbNnz+b444/nZz/7GZ/4xCeA7UsABw4c2P64gQMH8vrrr5eqzF5j9erVDBkyhJkzZ3LmmWcybdo04vE40LFPYrEY1dXVNDQ0lLLcXuO+++4jnU5z8sknd2h3nJRGIpFg0qRJwPYvpr7//e9z4oknAo6TUvrf42HQoEG88cYbJayo93nb297WHhRWrlzJwoULCYLAfimxK664gs9//vPU1tYCO3/vsE+6zmuvvUahUOCCCy5g0qRJzJs3z79fZSpW6gJKYeHChcyYMaND28iRI8lkMlx11VWceOKJ3HPPPXzmM5/hjjvuIAxDgiBof2wYhkQi5uHdaWd9MmLECJ599lk++9nPctlllzF
//nymT5/OnDlzdvh5+2T3+2fjpLm5mdmzZ+/weMfJnvfP+mT27Nlks1mmT59OPp/n/PPP3+nP2yddp1gs7jAe/vG2us6LL77I+eefz6WXXko0GmXlypXt99kvXWv+/PnstddeHHvssdx6662AY6XUCoUCS5cuZc6cOVRWVnLhhReSSqXskzLUK0PdhAkTmDBhQoe2hoYGJkyY0P4N9/jx47nyyivZsmULgwcPZsOGDQwfPhyATZs2MWjQoC6vuyfbWZ+sWrWKyZMnM27cOGD78rE3L+AdNGgQmzZtor6+nnw+T0tLC3V1dV1ed0+2sz6ZP38+N910E1OnTm1vmzRpEnPnznWcdIGd9QlAS0sLF154IXV1dfzwhz9sn9F2nJROfX09S5cubb+9ceNGx0MJLFu2jGnTpvHlL3+ZiRMnsmTJkg4bBtkvXeuuu+5i48aNTJo0iW3bttHa2koQBB36xPeOrjVgwACOPfZY+vXrB8CJJ57I3XffTTQabX+M46Q8+JXt3/Tt25dkMtn+Jrxs2TKqqqro168fY8eO5fbbbwdg6dKlJJNJhgwZUspye4Xhw4dTX1/Pgw8+CMADDzzAQQcdBMDYsWO57bbbgO1vEmPGjGn/IKs95+yzz+aPf/wjt99+e/uYuP3226murnaclNAll1zCiBEjuPHGG0kkEu3tjpPSeec738ljjz1GQ0MDbW1t/OEPf+Dd7353qcvqVdavX8+nP/1pZs2axcSJEwE47LDDePXVV9uXnN155532Sxe6+eabufPOO7n99tuZNm0aJ5xwAjNmzCCZTLJs2TJg+3uKfdJ1xo0bxyOPPEJjYyOFQoGHH36Yk08+2XFShoIwDMNSF9FdPP3003zjG98gnU5TVVXFFVdcwejRo8lkMlxxxRU888wzJBIJrr766vZwoT3rlVdeaZ8xra6uZubMmeyzzz5s3bqV6dOns3r1ampqapg1axZDhw4tdbm9zgEHHMDzzz8P4DgpkWeffZbJkyczatQoYrHtiy8GDRrEj3/8Y8dJiS1YsICbbrqJXC7HWWed1X6dtrrG1VdfzW9/+9v21QMAU6ZMYZ999mHGjBlkMhnGjh3LZZdd5tKyErj11ltZsmQJM2fO5LnnnuPyyy+nubmZgw46iBkzZnT4gkp71m9+8xtmz55NLpfjuOOO4/LLL+fxxx93nJQZQ50kSZIklTGXX0qSJElSGTPUSZIkSVIZM9RJkiRJUhkz1EmSJElSGTPUSZIkSVIZM9RJkiRJUhkz1EmSJElSGTPUSZIkSVIZ+/84v7zHbtSnrwAAAABJRU5ErkJggg==\n", "text/plain": [ - "'!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'" + "
" ] }, - "execution_count": 66, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" } ], "source": [ - "string.punctuation" + "# sns settings\n", + "sns.set(rc={'figure.figsize':(15,15)})\n", + "\n", + "# colors\n", + "palette = sns.color_palette(\"bright\", len(set(y_pred)))\n", + "\n", + "# plot\n", + "sns.scatterplot(X_embedded[:,0], X_embedded[:,1], hue=y_pred, legend='full', palette=palette)\n", + "plt.title(\"t-SNE Covid-19 Articles - Clustered\")\n", + "# plt.savefig(\"plots/t-sne_covid19_label.png\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PySpark K-Means Model" ] }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 441, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "removed all punctuation from all abstracts. time taken:0:00:00.588561\n" - ] - } - ], + "outputs": [], "source": [ - "start_time= datetime.now()\n", - "df_covid2['abstract'] = df_covid2['abstract'].apply(lambda x: str(x).translate(translator))\n", - "df_covid2.head()\n", - "end_time=datetime.now()\n", - "print(f'removed all punctuation from all abstracts. 
time taken:{end_time-start_time}')" + "from pyspark.mllib.clustering import KMeans, KMeansModel" + ] + }, + { + "cell_type": "code", + "execution_count": 442, + "metadata": {}, + "outputs": [], + "source": [ + "model = KMeans.train(sc.parallelize(X_train),\n", + " k,\n", + " maxIterations=10,\n", + " initializationMode=\"random\",\n", + " seed=50,\n", + " initializationSteps=5,\n", + " epsilon=1e-4)" ] }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 459, + "metadata": {}, + "outputs": [], + "source": [ + "from math import sqrt\n", + "def error(point):\n", + " center = model.centers[model.predict(point)]\n", + " return sqrt(sum([x**2 for x in (point - center)]))" + ] + }, + { + "cell_type": "code", + "execution_count": 465, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "removed all punctuation from all body_texts. time taken:0:00:26.866739\n" + "Within Set Sum of Squared Error = 502.9182715894793\n" ] } ], "source": [ - "start_time= datetime.now()\n", - "df_covid2['body_text'] = df_covid2['body_text'].apply(lambda x: str(x).translate(translator))\n", - "df_covid2.head()\n", - "end_time=datetime.now()\n", - "print(f'removed all punctuation from all body_texts. time taken:{end_time-start_time}')" + "WSSSE = sc.parallelize(X_train).map(lambda point: error(point)).reduce(lambda x, y: x + y)\n", + "print(\"Within Set Sum of Squared Error = \" + str(WSSSE))" ] }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 479, "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abstract
0middle east respiratory syndrome coronavirus m...
1in theoretical physics there exist two basic m...
2background the aim of this research was to eva...
3introduction the major incident hospital mih i...
4mice infected with the neurotropic jhm strain ...
\n", - "
" - ], "text/plain": [ - " abstract\n", - "0 middle east respiratory syndrome coronavirus m...\n", - "1 in theoretical physics there exist two basic m...\n", - "2 background the aim of this research was to eva...\n", - "3 introduction the major incident hospital mih i...\n", - "4 mice infected with the neurotropic jhm strain ..." + "[array([0., 0., 0., ..., 0., 0., 0.]),\n", + " array([0., 0., 0., ..., 0., 0., 0.]),\n", + " array([0., 0., 0., ..., 0., 0., 0.]),\n", + " array([0., 0., 0., ..., 0., 0., 0.]),\n", + " array([0., 0., 0., ..., 0., 0., 0.]),\n", + " array([0., 0., 0., ..., 0., 0., 0.]),\n", + " array([0., 0., 0., ..., 0., 0., 0.]),\n", + " array([0., 0., 0., ..., 0., 0., 0.]),\n", + " array([0., 0., 0., ..., 0., 0., 0.])]" ] }, - "execution_count": 69, + "execution_count": 479, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "abstracts = df_covid2.drop([\"paper_id\",\"body_text\", \"abstract_word_count\", \"body_word_count\", \"authors\", \"title\"], axis=1)\n", - "abstracts.head()" + "model.clusterCenters" ] }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 487, "metadata": {}, "outputs": [ { @@ -5157,149 +4153,292 @@ " \n", " \n", " \n", - " body_text\n", + " 0\n", + " 1\n", + " 2\n", + " 3\n", + " 4\n", + " 5\n", + " 6\n", + " 7\n", + " 8\n", + " 9\n", + " ...\n", + " 4086\n", + " 4087\n", + " 4088\n", + " 4089\n", + " 4090\n", + " 4091\n", + " 4092\n", + " 4093\n", + " 4094\n", + " 4095\n", " \n", " \n", " \n", " \n", " 0\n", - " middle east respiratory syndrome coronavirus m...\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " ...\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", " \n", " \n", " 1\n", - " in theoretical physics there exist two basic m...\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 
0.0\n", + " 0.0\n", + " ...\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", " \n", " \n", " 2\n", - " in view of the role of thinsection ct scanning...\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " ...\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", " \n", " \n", " 3\n", - " injury medical management does not simply invo...\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " ...\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", " \n", " \n", " 4\n", - " 1 frequently change andor sterilize gloves thr...\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " ...\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " \n", + " \n", + " 5\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " ...\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " \n", + " \n", + " 6\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " ...\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " \n", + " \n", + " 7\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " ...\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " \n", + 
" \n", + " 8\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " ...\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", " \n", " \n", "\n", + "

9 rows × 4096 columns

\n", "" ], "text/plain": [ - " body_text\n", - "0 middle east respiratory syndrome coronavirus m...\n", - "1 in theoretical physics there exist two basic m...\n", - "2 in view of the role of thinsection ct scanning...\n", - "3 injury medical management does not simply invo...\n", - "4 1 frequently change andor sterilize gloves thr..." + " 0 1 2 3 4 5 6 7 8 9 ... 4086 \\\n", + "0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n", + "1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n", + "2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n", + "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n", + "4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n", + "5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n", + "6 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n", + "7 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n", + "8 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n", + "\n", + " 4087 4088 4089 4090 4091 4092 4093 4094 4095 \n", + "0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "6 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "7 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "8 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "\n", + "[9 rows x 4096 columns]" ] }, - "execution_count": 70, + "execution_count": 487, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "bodytexts = df_covid2.drop([\"paper_id\",\"abstract\", \"abstract_word_count\", \"body_word_count\", \"authors\", \"title\"], axis=1)\n", - "bodytexts.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "metadata": {}, - "outputs": [], - "source": [ - "from nltk.tokenize import word_tokenize \n", - "from nltk.corpus import stopwords\n", - "\n", - "#stopwords = nltk.corpus.stopwords.words('english')\n", - "\n", - "def 
normalize_document(doc):\n", - "\n", - " # lower case and remove special characters\\whitespaces\n", - "\n", - " doc = str(doc).strip()\n", - " doc = str(doc).lower()\n", - " \n", - " translator = str.maketrans(' ',' ',string.punctuation) #where string.punctuation = '!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'\n", - " doc = str(doc).translate(translator)\n", - "\n", - " # tokenize document\n", - "\n", - " stop_words = set(stopwords.words('english')) \n", - " word_tokens = word_tokenize(doc) \n", - " filtered_text = [word for word in word_tokens if word not in stop_words] \n", - " return filtered_text" - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'x' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mabstracts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mabstracts\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'abstract'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnormalize_document\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mNameError\u001b[0m: name 'x' is not defined" - ] - } - ], - "source": [ - "abstracts = abstracts['abstract'].apply(normalize_document(x))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Forming n-grams" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[ref](https://stackoverflow.com/questions/21844546/forming-bigrams-of-words-in-list-of-sentences-with-python)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - 
"outputs": [], - "source": [ - "from nltk import word_tokenize\n", - "from nltk.util import ngrams" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for i in ngrams(abstracts['abstract'][0].split(\" \"),2):\n", - " print(i)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Word Cloud of Abstracts" + "cluster_centers_pandas = pd.DataFrame(model.clusterCenters)\n", + "cluster_centers_pandas" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "In this section we make a word cloud of the abstracts we have got. [ref](https://www.datacamp.com/community/tutorials/wordcloud-python)" + "# Find top N closest words" ] }, { @@ -5308,87 +4447,72 @@ "metadata": {}, "outputs": [], "source": [ - "import wordcloud as wc" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Start with one review:\n", + "from pyspark import SparkContext\n", + "from pyspark.mllib.feature import Word2Vec\n", "\n", - "text = abstracts['abstract'][0]\n", - "\n", - "# Create and generate a word cloud image:\n", - "wordcloud = wc.WordCloud().generate(text)\n", - "\n", - "# Display the generated image:\n", - "plt.imshow(wordcloud, interpolation='bilinear')\n", - "plt.axis(\"off\")\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "## Target Column:\n", - "print(f\"Total number of abstracts: {len(abstracts['abstract'])}\")\n", - "print(f\"Number of nan abstracts: {abstracts['abstract'].value_counts()['nan']}\")" + "sc = SparkContext()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 381, "metadata": {}, "outputs": [], "source": [ - "abstracts = abstracts.loc[~(abstracts['abstract'] == 'nan')]" + "#Generate doc\n", + "localDoc = abstracts['abstract']\n", + "doc = sc.parallelize(localDoc).map(lambda line: line.split(\" \"))" ] }, { 
"cell_type": "code", - "execution_count": null, + "execution_count": 370, "metadata": {}, "outputs": [], "source": [ - "## Target Column:\n", - "print(f\"Total number of abstracts: {len(abstracts['abstract'])}\")" + "# Learn a mapping from words to Vectors.\n", + "model = Word2Vec().fit(doc)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 400, "metadata": {}, "outputs": [], "source": [ - "start_time = datetime.now()\n", - "wordcloud2 = wc.WordCloud(width=1920,height=1080).generate(' '.join(abstracts['abstract']))\n", - "end_time=datetime.now()\n", - "print(f'generated wordclouds from all {len(abstracts)} abstracts. time taken:{end_time-start_time}')" + "word = \"corona\"\n", + "topN = 10\n", + "synonyms = model.findSynonyms(word, topN)\n", + "result = [(s[0], s[1]) for s in synonyms]" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plot = plt.figure(figsize=(60, 60))\n", - "plt.imshow(wordcloud2)\n", - "plt.axis(\"off\")\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", + "execution_count": 401, "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('caused', 0.8772739171981812),\n", + " ('named', 0.8559650778770447),\n", + " ('causing', 0.833193838596344),\n", + " ('background', 0.8326801061630249),\n", + " ('coronavirus', 0.7966358661651611),\n", + " ('became', 0.781213104724884),\n", + " ('2019ncov', 0.7766153812408447),\n", + " ('disease2019', 0.7706559896469116),\n", + " ('emerged', 0.768865704536438),\n", + " ('abstractthe', 0.76090407371521)]" + ] + }, + "execution_count": 401, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Forming word Vectors" + "result" ] }, { @@ -5396,55 +4520,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "from sklearn import Tf" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Dimensionality Reduction" - ] - }, - { - "cell_type": 
"markdown", - "metadata": {}, - "source": [ - "# Clustering " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[Ref - Scikit learn](https://scikit-learn.org/stable/auto_examples/text/plot_document_clustering.html#sphx-glr-auto-examples-text-plot-document-clustering-py)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Visualization and stuff" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# References" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "1. [COVID-19 Open Research Dataset Challenge (CORD-19)](https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge/)\n", - "2. [How to build a topic-based search engine](https://www.smithinst.co.uk/insights/build-topic-based-search-engine/)\n", - "3. W. Buntine et al., \"A Scalable Topic-Based Open Source Search Engine,\" IEEE/WIC/ACM International Conference on Web Intelligence (WI'04), Beijing, China, 2004, pp. 228-234.\n", - "4. [Semantic Topic Modeling for Search Queries at Google](https://gofishdigital.com/semantic-topic-modeling/)\n", - "5. GRANT, C.; PAZHAYIDAM GEORGE, C.; KANJILAL, V.; NIRKHIWALE, S.; WILSON, J.; WANG, D.. A Topic-Based Search, Visualization, and Exploration System. Florida Artificial Intelligence Research Society Conference, North America, apr. 2015. Available at: . Date accessed: 25 Apr. 2020." - ] + "source": [] } ], "metadata": { @@ -5463,7 +4539,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.7.3" }, "toc": { "base_numbering": 1,