<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" media="screen" href="/~d/styles/atom10full.xsl"?><?xml-stylesheet type="text/css" media="screen" href="http://feeds.feedburner.com/~d/styles/itemcontent.css"?><feed xmlns="http://www.w3.org/2005/Atom" xmlns:openSearch="http://a9.com/-/spec/opensearch/1.1/" xmlns:blogger="http://schemas.google.com/blogger/2008" xmlns:georss="http://www.georss.org/georss" xmlns:gd="http://schemas.google.com/g/2005" xmlns:thr="http://purl.org/syndication/thread/1.0" gd:etag="W/&quot;A04HQXY-eCp7ImA9WhBbF00.&quot;"><id>tag:blogger.com,1999:blog-6232819486261696035</id><updated>2013-05-16T08:12:10.850-05:00</updated><category term="Stata" /><category term="Policy" /><category term="Python" /><category term="1000 genomes" /><category term="Recommended Reading" /><category term="Twitter" /><category term="SQL" /><category term="Visualization" /><category term="Statistics" /><category term="Machine Learning" /><category term="Noteworthy blogs" /><category term="Pathways" /><category term="ggplot2" /><category term="Imputation" /><category term="Perl" /><category term="Sequencing" /><category term="dbGaP" /><category term="Search" /><category term="RNA-Seq" /><category term="Tutorials" /><category term="Announcements" /><category term="PubMed" /><category term="Productivity" /><category term="RSS" /><category term="Conferences" /><category term="Metagenomics" /><category term="GWAS" /><category term="Journal club" /><category term="Linux" /><category term="Clustering" /><category term="Annotation" /><category term="Software" /><category term="Writing" /><category term="Web Apps" /><category term="Databases" /><category term="Ethics" /><category term="PLINK" /><category term="News" /><category term="ENCODE" /><category term="Bioinformatics" /><category term="R" /><title>Getting Genetics Done</title><subtitle type="html">Getting Things Done in Genetics &amp;amp; Bioinformatics Research</subtitle><link rel="http://schemas.google.com/g/2005#feed" 
type="application/atom+xml" href="http://gettinggeneticsdone.blogspot.com/feeds/posts/default" /><link rel="alternate" type="text/html" href="http://gettinggeneticsdone.blogspot.com/" /><link rel="next" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default?start-index=26&amp;max-results=25&amp;redirect=false&amp;v=2" /><author><name>Stephen Turner</name><uri>http://www.blogger.com/profile/06656711316726116187</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="26" height="32" src="http://3.bp.blogspot.com/-aT3qBWI4VYc/TgvR9CnlS0I/AAAAAAAAMDk/KuA2GGqURcc/s220/pic2-cropped-400x500.jpg" /></author><generator version="7.00" uri="http://www.blogger.com">Blogger</generator><openSearch:totalResults>338</openSearch:totalResults><openSearch:startIndex>1</openSearch:startIndex><openSearch:itemsPerPage>25</openSearch:itemsPerPage><atom10:link xmlns:atom10="http://www.w3.org/2005/Atom" rel="self" type="application/atom+xml" href="http://feeds.feedburner.com/GettingGeneticsDone" /><feedburner:info xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0" uri="gettinggeneticsdone" /><atom10:link xmlns:atom10="http://www.w3.org/2005/Atom" rel="hub" href="http://pubsubhubbub.appspot.com/" /><link rel="license" type="text/html" href="http://creativecommons.org/licenses/by-sa/3.0/" /><feedburner:emailServiceId xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0">GettingGeneticsDone</feedburner:emailServiceId><feedburner:feedburnerHostname xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0">http://feedburner.google.com</feedburner:feedburnerHostname><entry gd:etag="W/&quot;AkADRHw6fCp7ImA9WhBbFk8.&quot;"><id>tag:blogger.com,1999:blog-6232819486261696035.post-5040082614815539483</id><published>2013-05-15T09:39:00.000-05:00</published><updated>2013-05-15T09:39:35.214-05:00</updated><app:edited 
xmlns:app="http://www.w3.org/2007/app">2013-05-15T09:39:35.214-05:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="RNA-Seq" /><category scheme="http://www.blogger.com/atom/ns#" term="Metagenomics" /><category scheme="http://www.blogger.com/atom/ns#" term="Conferences" /><category scheme="http://www.blogger.com/atom/ns#" term="Visualization" /><category scheme="http://www.blogger.com/atom/ns#" term="Twitter" /><category scheme="http://www.blogger.com/atom/ns#" term="R" /><category scheme="http://www.blogger.com/atom/ns#" term="Bioinformatics" /><title>Automated Archival and Visual Analysis of Tweets Mentioning #bog13, Bioinformatics, #rstats, and Others</title><content type="html">&lt;b&gt;&lt;span style="font-size: large;"&gt;Automatically Archiving Twitter Results&lt;/span&gt;&lt;/b&gt;&lt;br /&gt;
&lt;br /&gt;
Ever since &lt;a href="http://lifehacker.com/5937648/whys-everybody-so-pissed-about-twitter-and-should-i-care" target="_blank"&gt;Twitter gamed its own API&lt;/a&gt; and killed off great services like &lt;a href="http://techcrunch.com/2012/09/20/ifttt-is-the-latest-service-to-be-affected-by-twitters-api-constraints-will-remove-triggers/" target="_blank"&gt;IFTTT triggers&lt;/a&gt;, I've been looking for a way to automatically archive tweets containing certain search terms of interest to me. Twitter's built-in search is limited, and I wanted to archive interesting tweets for future reference and to start playing around with some basic text / trend analysis.&lt;br /&gt;
&lt;br /&gt;
Enter &lt;a href="https://github.com/sferik/t" target="_blank"&gt;&lt;b&gt;&lt;span style="font-family: Courier New, Courier, monospace;"&gt;t&lt;/span&gt;&lt;/b&gt; - the twitter command-line interface&lt;/a&gt;. t is a command-line power tool for doing all sorts of powerful Twitter queries using the command line. See &lt;a href="https://github.com/sferik/t/blob/master/README.md" target="_blank"&gt;&lt;b&gt;&lt;span style="font-family: Courier New, Courier, monospace;"&gt;t&lt;/span&gt;&lt;/b&gt;'s documentation&lt;/a&gt; for examples.&lt;br /&gt;
&lt;br /&gt;
I wrote &lt;a href="https://github.com/stephenturner/twitterchive/blob/master/twitterchive.sh" target="_blank"&gt;this script&lt;/a&gt;&amp;nbsp;that uses the t utility to search Twitter separately for a set of specified keywords, and append those results to a file. The comments at the end of the script also show you how to commit changes to a git repository, push to GitHub, and automate the entire process to run twice a day with a cron job. Here's the code as of May 14, 2013:&lt;br /&gt;
&lt;br /&gt;
&lt;script src="https://gist.github.com/stephenturner/5579435.js"&gt;&lt;/script&gt;&lt;br /&gt;
&lt;br /&gt;
That script, and results for searching for "&lt;a href="https://raw.github.com/stephenturner/twitterchive/master/bioinformatics.txt" target="_blank"&gt;bioinformatics&lt;/a&gt;", "&lt;a href="https://github.com/stephenturner/twitterchive/blob/master/metagenomics.txt" target="_blank"&gt;metagenomics&lt;/a&gt;", "&lt;a href="https://raw.github.com/stephenturner/twitterchive/master/rstats.txt" target="_blank"&gt;#rstats&lt;/a&gt;", "&lt;a href="https://github.com/stephenturner/twitterchive/blob/master/rna-seq.txt" target="_blank"&gt;rna-seq&lt;/a&gt;", and "&lt;a href="https://github.com/stephenturner/twitterchive/blob/master/bog13.txt" target="_blank"&gt;#bog13&lt;/a&gt;" (the Biology of Genomes 2013 meeting) are all in the GitHub repository below. (Please note that these results update dynamically, and searching Twitter at any point could possibly result in returning some unsavory Tweets.)&lt;br /&gt;
&lt;br /&gt;
&lt;a href="https://github.com/stephenturner/twitterchive" target="_blank"&gt;https://github.com/stephenturner/twitterchive&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;&lt;span style="font-size: large;"&gt;Analyzing Tweets using R&lt;/span&gt;&lt;/b&gt;&lt;br /&gt;
&lt;br /&gt;
You'll also find an &lt;a href="https://github.com/stephenturner/twitterchive/tree/master/analysis" target="_blank"&gt;analysis subdirectory&lt;/a&gt;, containing some &lt;a href="https://github.com/stephenturner/twitterchive/blob/master/analysis/twitterchive.r" target="_blank"&gt;R code&lt;/a&gt; to produce barplots showing the number of tweets per day over the last month, frequency of tweets by hour of the day, the most used hashtags within a search, the most prolific tweeters, and a ubiquitous word cloud. Much of this code is inspired by &lt;a href="http://nsaunders.wordpress.com/2012/08/16/twitter-coverage-of-the-ismb-2012-meeting-some-statistics/" target="_blank"&gt;Neil Saunders's analysis of Tweets from ISMB 2012&lt;/a&gt;. Here's the code as of May 14, 2013:&lt;br /&gt;
&lt;br /&gt;
&lt;script src="https://gist.github.com/stephenturner/5579421.js"&gt;&lt;/script&gt;&lt;br /&gt;
&lt;br /&gt;
Also in that &lt;a href="https://github.com/stephenturner/twitterchive/tree/master/analysis" target="_blank"&gt;analysis directory&lt;/a&gt; you'll see periodically updated plots for the results of the queries above.&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;&lt;span style="font-size: large;"&gt;Analyzing Tweets mentioning "bioinformatics"&lt;/span&gt;&lt;/b&gt;&lt;br /&gt;
&lt;br /&gt;
Using the bioinformatics query, here is the number of tweets per day over the last month:&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://3.bp.blogspot.com/-jIw_d2YBt4Y/UZKkorF0XOI/AAAAAAABEOI/yMdpWSDJqBw/s1600/bioinformatics--barplot-tweets-by-date.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="448" src="http://3.bp.blogspot.com/-jIw_d2YBt4Y/UZKkorF0XOI/AAAAAAABEOI/yMdpWSDJqBw/s640/bioinformatics--barplot-tweets-by-date.png" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
Here is the frequency of "bioinformatics" tweets by hour:&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://3.bp.blogspot.com/-1PDCsLa_mhk/UZKkpHDF6rI/AAAAAAABEOQ/F8O77jwH2fE/s1600/bioinformatics--barplot-tweets-by-hour.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="448" src="http://3.bp.blogspot.com/-1PDCsLa_mhk/UZKkpHDF6rI/AAAAAAABEOQ/F8O77jwH2fE/s640/bioinformatics--barplot-tweets-by-hour.png" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;
Here are the most used hashtags (other than #bioinformatics):&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://4.bp.blogspot.com/-oTFrkEQWlb0/UZKkon2bNrI/AAAAAAABEOE/kDYuMBTzhRo/s1600/bioinformatics--barplot-top-hashtags.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="448" src="http://4.bp.blogspot.com/-oTFrkEQWlb0/UZKkon2bNrI/AAAAAAABEOE/kDYuMBTzhRo/s640/bioinformatics--barplot-top-hashtags.png" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;
Here are the most prolific bioinformatics Tweeps:&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://4.bp.blogspot.com/-LNO03JaPfeU/UZKkohwyz2I/AAAAAAABEOM/SRPPJyTb-mU/s1600/bioinformatics--barplot-top-users.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="448" src="http://4.bp.blogspot.com/-LNO03JaPfeU/UZKkohwyz2I/AAAAAAABEOM/SRPPJyTb-mU/s640/bioinformatics--barplot-top-users.png" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;
Here's a wordcloud for all the bioinformatics Tweets since March:&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://4.bp.blogspot.com/-giXfBt3RnFo/UZKkpjy7QiI/AAAAAAABEOk/DF1Z-yxmbYw/s1600/bioinformatics--wordcloud.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="640" src="http://4.bp.blogspot.com/-giXfBt3RnFo/UZKkpjy7QiI/AAAAAAABEOk/DF1Z-yxmbYw/s640/bioinformatics--wordcloud.png" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;b&gt;&lt;span style="font-size: large;"&gt;Analyzing Tweets mentioning "#bog13"&lt;/span&gt;&lt;/b&gt;&lt;br /&gt;
&lt;br /&gt;
The &lt;a href="http://meetings.cshl.edu/meetings/2013/genome13.shtml" target="_blank"&gt;2013 CSHL Biology of Genomes Meeting&lt;/a&gt; took place May 7-11, 2013. I searched and archived Tweets mentioning #bog13 from May 1 through May 14 using this script. You'll notice in the code above that I'm no longer archiving this hashtag. I probably need a better way to temporarily add keywords to the search, but I haven't gotten there yet.&lt;br /&gt;
&lt;br /&gt;
Here is the number of Tweets per day during that period. Tweets clearly peaked a couple days into the meeting, with follow-up commentary trailing off quickly after the meeting ended.&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://3.bp.blogspot.com/-DMKRR7dBYGI/UZKkqLZ0U-I/AAAAAAABEOo/PNifpXzLbd4/s1600/bog13--barplot-tweets-by-date.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="448" src="http://3.bp.blogspot.com/-DMKRR7dBYGI/UZKkqLZ0U-I/AAAAAAABEOo/PNifpXzLbd4/s640/bog13--barplot-tweets-by-date.png" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
Here is the frequency of Tweets by hour, clearly bimodal:&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://4.bp.blogspot.com/-7hhNb6a3fy0/UZLUxfcqQPI/AAAAAAABEPg/-FUh8CfLTWw/s1600/bog13--barplot-tweets-by-hour.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="448" src="http://4.bp.blogspot.com/-7hhNb6a3fy0/UZLUxfcqQPI/AAAAAAABEPg/-FUh8CfLTWw/s640/bog13--barplot-tweets-by-hour.png" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;
Top hashtags (other than #bog13). Interestingly, #bog14 was the most highly used hashtag, so I'm guessing lots of folks are looking forward to next year's meeting. Also, #ashg12 got lots of mentions, presumably because someone presented updated work from last year's ASHG meeting.&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://3.bp.blogspot.com/-3VJZqgUwfDU/UZLUxPhMykI/AAAAAAABEPY/mG5IxV_WJhg/s1600/bog13--barplot-top-hashtags.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="448" src="http://3.bp.blogspot.com/-3VJZqgUwfDU/UZLUxPhMykI/AAAAAAABEPY/mG5IxV_WJhg/s640/bog13--barplot-top-hashtags.png" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;
Here were the most prolific Tweeps - many of the usual suspects here, as well as a few new ones (new to me at least):&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://1.bp.blogspot.com/-LTVaK3ZP-bY/UZLUxUOENAI/AAAAAAABEPc/ZDDu6MJFyEA/s1600/bog13--barplot-top-users.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="448" src="http://1.bp.blogspot.com/-LTVaK3ZP-bY/UZLUxUOENAI/AAAAAAABEPc/ZDDu6MJFyEA/s640/bog13--barplot-top-users.png" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;
And finally, the requisite wordcloud:&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://1.bp.blogspot.com/-0gsImWLYXmA/UZLUyj0oZ0I/AAAAAAABEPw/ydZ8TtgFUaw/s1600/bog13--wordcloud.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="640" src="http://1.bp.blogspot.com/-0gsImWLYXmA/UZLUyj0oZ0I/AAAAAAABEPw/ydZ8TtgFUaw/s640/bog13--wordcloud.png" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;b&gt;&lt;span style="font-size: large;"&gt;More analysis&lt;/span&gt;&lt;/b&gt;&lt;br /&gt;
&lt;br /&gt;
If you look in the &lt;a href="https://github.com/stephenturner/twitterchive/tree/master/analysis" target="_blank"&gt;analysis directory of the repo&lt;/a&gt; you'll find plots like these for other keywords (&lt;i&gt;#rstats&lt;/i&gt;, &lt;i&gt;metagenomics&lt;/i&gt;, &lt;i&gt;rna-seq&lt;/i&gt;, and others to come). I would also like to do some sentiment analysis as Neil did in the ISMB post referenced above, but the &lt;i&gt;sentiment&lt;/i&gt;&amp;nbsp;package has since been removed from CRAN. I hear there are other packages for polarity analysis, but I haven't yet figured out how to use them. I've given you the code to do the mundane stuff (parsing the fixed-width files from &lt;b&gt;&lt;span style="font-family: Courier New, Courier, monospace;"&gt;t&lt;/span&gt;&lt;/b&gt;, for starters). I'd love to see someone take a stab at some further text mining / polarity / sentiment analysis!&lt;br /&gt;
&lt;br /&gt;
&lt;a href="https://github.com/stephenturner/twitterchive" target="_blank"&gt;twitterchive - archive and analyze results from a Twitter search&lt;/a&gt;&lt;div class="blogger-post-footer"&gt;Getting Genetics Done by Stephen Turner is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/GettingGeneticsDone/~4/Klow7npj7t8" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://gettinggeneticsdone.blogspot.com/feeds/5040082614815539483/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2013/05/automated-analysis-tweets-bioinformatics-twitterchive.html#comment-form" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/5040082614815539483?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/5040082614815539483?v=2" /><link rel="alternate" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2013/05/automated-analysis-tweets-bioinformatics-twitterchive.html" title="Automated Archival and Visual Analysis of Tweets Mentioning #bog13, Bioinformatics, #rstats, and Others" /><author><name>Stephen Turner</name><uri>http://www.blogger.com/profile/06656711316726116187</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="26" height="32" src="http://3.bp.blogspot.com/-aT3qBWI4VYc/TgvR9CnlS0I/AAAAAAAAMDk/KuA2GGqURcc/s220/pic2-cropped-400x500.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://3.bp.blogspot.com/-jIw_d2YBt4Y/UZKkorF0XOI/AAAAAAABEOI/yMdpWSDJqBw/s72-c/bioinformatics--barplot-tweets-by-date.png" height="72" width="72" /><thr:total>0</thr:total></entry><entry 
gd:etag="W/&quot;CkABQHs_fyp7ImA9WhBUGEs.&quot;"><id>tag:blogger.com,1999:blog-6232819486261696035.post-6356553280857629267</id><published>2013-05-06T12:19:00.000-05:00</published><updated>2013-05-06T12:19:11.547-05:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2013-05-06T12:19:11.547-05:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Metagenomics" /><category scheme="http://www.blogger.com/atom/ns#" term="Recommended Reading" /><category scheme="http://www.blogger.com/atom/ns#" term="Bioinformatics" /><title>Three Metagenomics Papers for You</title><content type="html">A handful of good metagenomics papers have come out over the last few months. Below I've linked to and copied my evaluation of each of these articles from F1000.&lt;br /&gt;
&lt;br /&gt;
...&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://www.ncbi.nlm.nih.gov/pubmed/23450659" target="_blank"&gt;1.&amp;nbsp;Willner, Dana, and Philip Hugenholtz. "From deep sequencing to viral tagging: Recent advances in viral metagenomics." &lt;i&gt;BioEssays&lt;/i&gt; (2013).&amp;nbsp;&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://f1000.com/prime/contributor/evaluate/article/717988265" target="_blank"&gt;My evaluation&lt;/a&gt;: This review lays out some of the challenges and recent advances in viral metagenomic sequencing. There is a good discussion of library preparation and how that affects downstream sequencing. Alarmingly, they reference &lt;a href="http://www.ncbi.nlm.nih.gov/pubmed/21926223" target="_blank"&gt;another paper&lt;/a&gt; that showed that different amplification methods resulted in detection of a completely different set of viruses (dsDNA viruses with LASL, ssDNA with MDA). The review also discusses many of the data management, analysis, and bioinformatics challenges associated with viral metagenomics.&lt;br /&gt;
&lt;br /&gt;
...&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://www.ncbi.nlm.nih.gov/pubmed/23571589" target="_blank"&gt;2. Loman, Nicholas J., et al. "A Culture-Independent Sequence-Based Metagenomics Approach to the Investigation of an Outbreak of Shiga-Toxigenic Escherichia coli O104:H4." &lt;i&gt;JAMA&lt;/i&gt; 309.14 (2013): 1502-1510.&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://f1000.com/prime/contributor/evaluate/article/718002711" target="_blank"&gt;My evaluation&lt;/a&gt;:&amp;nbsp;This paper is a groundbreaking exploration of the use of metagenomics to investigate and determine the causal organism of an infectious disease outbreak. The authors retrospectively collected fecal samples from symptomatic patients from the 2011 Escherichia coli O104:H4 outbreak in Germany and performed high-throughput shotgun sequencing, followed by a sophisticated analysis to determine the outbreak's causal organism. The analysis included comparing genetic markers from many symptomatic patients' metagenomes with those of healthy controls, followed by de novo assembly of the outbreak strain from the shotgun metagenomic data. This illustrates both the power and the real limitations of using metagenomic approaches for clinical diagnostics. Also see David Relman's synopsis of the study in the same JAMA issue.&lt;br /&gt;
&lt;br /&gt;
...&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://www.ncbi.nlm.nih.gov/pubmed/23387867" target="_blank"&gt;3. Shakya, Migun, et al. "Comparative metagenomic and rRNA microbial diversity characterization using archaeal and bacterial synthetic communities." &lt;i&gt;Environmental microbiology&lt;/i&gt; (2013).&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://f1000.com/prime/contributor/evaluate/article/718002709" target="_blank"&gt;My evaluation&lt;/a&gt;:&amp;nbsp;This study set out to compare shotgun metagenomic sequencing to 16S rRNA amplicon sequencing to determine the taxonomic and abundance profiles of mixed community metagenomic samples. Thus far, benchmarking metagenomic methodology has been difficult due to the lack of datasets where the underlying ground truth is known. In this study, the researchers constructed synthetic metagenomic communities consisting of 64 laboratory mixed genome DNAs of known sequence and polymerase chain reaction (PCR)-validated abundance. The researchers then compared metagenomic and 16S amplicon sequencing, using both 454 and Illumina technology, and found that metagenomic sequencing outperformed 16S sequencing in quantifying community composition. The synthetic metagenomes constructed here are publicly available (Gene Expression Omnibus [GEO] accession numbers are given in the manuscript), which represent a great asset to other researchers developing methods for amplicon-based or metagenomic approaches to sequence classification, diversity analysis, and abundance estimation.&lt;div class="blogger-post-footer"&gt;Getting Genetics Done by Stephen Turner is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/GettingGeneticsDone/~4/8rRPDnuTsWA" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://gettinggeneticsdone.blogspot.com/feeds/6356553280857629267/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2013/05/three-metagenomics-papers-for-you.html#comment-form" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/6356553280857629267?v=2" /><link rel="self" type="application/atom+xml" 
href="http://www.blogger.com/feeds/6232819486261696035/posts/default/6356553280857629267?v=2" /><link rel="alternate" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2013/05/three-metagenomics-papers-for-you.html" title="Three Metagenomics Papers for You" /><author><name>Stephen Turner</name><uri>http://www.blogger.com/profile/06656711316726116187</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="26" height="32" src="http://3.bp.blogspot.com/-aT3qBWI4VYc/TgvR9CnlS0I/AAAAAAAAMDk/KuA2GGqURcc/s220/pic2-cropped-400x500.jpg" /></author><thr:total>0</thr:total></entry><entry gd:etag="W/&quot;DkMHRXc8eip7ImA9WhBWEEU.&quot;"><id>tag:blogger.com,1999:blog-6232819486261696035.post-854262577874454434</id><published>2013-04-04T09:06:00.001-05:00</published><updated>2013-04-04T09:07:14.972-05:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2013-04-04T09:07:14.972-05:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Tutorials" /><category scheme="http://www.blogger.com/atom/ns#" term="Announcements" /><category scheme="http://www.blogger.com/atom/ns#" term="R" /><category scheme="http://www.blogger.com/atom/ns#" term="Bioinformatics" /><title>List of Bioinformatics Workshops and Training Resources</title><content type="html">I frequently get asked to recommend workshops or online learning resources for bioinformatics, genomics, statistics, and programming. I &lt;a href="http://stephenturner.us/p/edu" target="_blank"&gt;compiled a list&lt;/a&gt; of both online learning resources and in-person workshops (preferentially highlighting those where workshop materials are freely available online):&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://stephenturner.us/p/edu" target="_blank"&gt;&lt;b&gt;List of Bioinformatics Workshops and Training Resources&lt;/b&gt;&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
I hope to keep the page above as up-to-date as possible. Below is a snapshot of what I have listed as of today. Please leave a comment if you're aware of any egregious omissions, and I'll update the page above as appropriate.&lt;br /&gt;
&lt;br /&gt;
&lt;i&gt;From &lt;a href="http://stephenturner.us/p/edu"&gt;http://stephenturner.us/p/edu&lt;/a&gt;, April 4, 2013&lt;/i&gt;&lt;br /&gt;
&lt;b&gt;&lt;i&gt;&lt;br /&gt;&lt;/i&gt;&lt;/b&gt;
&lt;b&gt;&lt;i&gt;In-Person Workshops:&lt;/i&gt;&lt;/b&gt;&lt;br /&gt;
&lt;b&gt;&lt;br /&gt;&lt;/b&gt;
&lt;b&gt;Cold Spring Harbor Courses:&amp;nbsp;&lt;/b&gt;&lt;a href="http://meetings.cshl.edu/courses.html" target="_blank"&gt;meetings.cshl.edu/courses.html&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
Cold Spring Harbor has been offering advanced workshops and short courses in the life sciences for years. Relevant workshops include&amp;nbsp;&lt;a href="http://meetings.cshl.edu/courses/2013/c-seqtech13.shtml" title="2013  CSHL course in Advanced Sequencing Technologies &amp;amp; Applications"&gt;Advanced Sequencing Technologies &amp;amp; Applications&lt;/a&gt;,&amp;nbsp;&lt;a href="http://meetings.cshl.edu/courses/2013/c-ecg13.shtml" title="2013  CSHL course on Computational &amp;amp; Comparative Genomics"&gt;Computational &amp;amp; Comparative Genomics&lt;/a&gt;,&amp;nbsp;&lt;a href="http://meetings.cshl.edu/courses/2013/c-info13.shtml"&gt;Programming for Biology&lt;/a&gt;,&amp;nbsp;&lt;a href="http://meetings.cshl.edu/courses/2013/c-data13.shtml" title="2013 CSHL course on Statistical Methods for Functional Genomics"&gt;Statistical Methods for Functional Genomics&lt;/a&gt;, the&amp;nbsp;&lt;a href="http://meetings.cshl.edu/courses/2013/tgac13.shtml" title="The Genome Access Course (TGAC) 2013"&gt;Genome Access Course&lt;/a&gt;, and others. Unlike most of the others below, you won't find material from past years' CSHL courses available online.&lt;br /&gt;
&lt;b&gt;&lt;br /&gt;&lt;/b&gt;
&lt;b&gt;Canadian Bioinformatics Workshops:&amp;nbsp;&lt;/b&gt;&lt;a href="http://bioinformatics.ca/workshops" target="_blank"&gt;bioinformatics.ca/workshops&lt;/a&gt;&lt;br /&gt;
Bioinformatics.ca through its Canadian Bioinformatics Workshops (CBW) series began offering one and two week short courses in bioinformatics, genomics and proteomics in 1999. The more recent workshops focus on training researchers using advanced high-throughput technologies on the latest approaches being used in computational biology to deal with the new data. Course material from&amp;nbsp;&lt;a href="http://bioinformatics.ca/workshops/2012" title="Past Workshops - 2012 | Bioinformatics.ca"&gt;past workshops&lt;/a&gt;&amp;nbsp;is freely available online, including both audio/video lectures and slideshows. Topics include&amp;nbsp;&lt;a href="http://bioinformatics.ca/workshops/2012/microarray-data-analysis" title="Microarray Data Analysis | Bioinformatics.ca"&gt;microarray analysis&lt;/a&gt;,&amp;nbsp;&lt;a href="http://bioinformatics.ca/workshops/2012/informatics-high-throughput-sequencing-data" title="Informatics on High Throughput Sequencing Data | Bioinformatics.ca"&gt;RNA-seq analysis&lt;/a&gt;, genome rearrangements, copy number alteration, &lt;a href="http://bioinformatics.ca/workshops/2012/pathway-and-network-analysis-omics-data" title="Pathway and Network Analysis of -omics Data  | Bioinformatics.ca"&gt;network/pathway analysis&lt;/a&gt;, genome visualization, gene function prediction, functional annotation, data analysis using R,&amp;nbsp;&lt;a href="http://bioinformatics.ca/workshops/2012/informatics-and-statistics-metabolomics" title="Informatics and Statistics for Metabolomics | Bioinformatics.ca"&gt;statistics for metabolomics&lt;/a&gt;, and&amp;nbsp;&lt;a href="http://bioinformatics.ca/workshops/2012/informatics-high-throughput-sequencing-data" title="Informatics on High Throughput Sequencing Data | Bioinformatics.ca"&gt;much more&lt;/a&gt;.&lt;br /&gt;
&lt;b&gt;&lt;br /&gt;&lt;/b&gt;
&lt;b&gt;UC Davis Bioinformatics Training Program:&amp;nbsp;&lt;/b&gt;&lt;a href="http://training.bioinformatics.ucdavis.edu/" target="_blank"&gt;training.bioinformatics.ucdavis.edu&lt;/a&gt;&lt;br /&gt;
The UC Davis Bioinformatics Training program offers several intensive&amp;nbsp;&lt;a href="http://training.bioinformatics.ucdavis.edu/bootcamps/" title="Boot Camps | UC Davis Bioinformatics Training Program"&gt;short bootcamp workshops&lt;/a&gt;&amp;nbsp;on RNA-seq, data analysis and visualization, and cloud computing with a focus on Amazon's computing resources. They also offer a week-long&amp;nbsp;&lt;a href="http://training.bioinformatics.ucdavis.edu/courses/" title="Courses | UC Davis Bioinformatics Training Program"&gt;Bioinformatics Short Course&lt;/a&gt;, covering in-depth the practical theory and application of cutting-edge next-generation sequencing techniques. Every course's documentation is&amp;nbsp;&lt;a href="http://training.bioinformatics.ucdavis.edu/documentation/" title="Documentation | UC Davis Bioinformatics Training Program"&gt;freely available online&lt;/a&gt;, even if you didn't take the course.&lt;br /&gt;
&lt;b&gt;&lt;br /&gt;&lt;/b&gt;
&lt;b&gt;MSU NGS Summer Course:&amp;nbsp;&lt;/b&gt;&lt;a href="http://bioinformatics.msu.edu/ngs-summer-course-2013" target="_blank"&gt;bioinformatics.msu.edu/ngs-summer-course-2013&lt;/a&gt;&lt;br /&gt;
This intensive two week summer course will introduce attendees with a strong biology background to the practice of analyzing short-read sequencing data from Illumina and other next-gen platforms. The first week will introduce students to computational thinking and large-scale data analysis on UNIX platforms. The second week will focus on mapping, assembly, and analysis of short-read data for resequencing, ChIP-seq, and RNAseq.&amp;nbsp;&lt;a href="http://ged.msu.edu/angus/"&gt;Materials from previous courses are freely available online&lt;/a&gt;&amp;nbsp;under a CC-by-SA license.&lt;br /&gt;
&lt;b&gt;&lt;br /&gt;&lt;/b&gt;
&lt;b&gt;Genetic Analysis of Complex Human Diseases:&amp;nbsp;&lt;/b&gt;&lt;a href="http://hihg.med.miami.edu/educational-programs/analysis-of-complex-human-diseases/genetic-analysis-of-complex-human-diseases/" target="_blank"&gt;hihg.med.miami.edu/edu...&lt;/a&gt;&lt;br /&gt;
The Genetic Analysis of Complex Human Diseases is a comprehensive four-day course directed toward physician-scientists and other medical researchers. The course will introduce state-of-the-art approaches for the mapping and characterization of human inherited disorders with an emphasis on the mapping of genes involved in common and genetically complex disease phenotypes. The primary goal of this course is to provide participants with an overview of approaches to identifying genes involved in complex human diseases. At the end of the course, participants should be able to identify the key components of a study team, and communicate effectively with specialists in various areas to design and execute a study. The course is in Miami Beach, FL. (Full Disclosure: I teach a section in this course.) Most of the course material from previous years is not available online, but&amp;nbsp;&lt;a href="http://figshare.com/authors/Stephen%20Turner/100774" title="Stephen Turner"&gt;my RNA-seq &amp;amp; methylation lectures are on Figshare&lt;/a&gt;.&lt;br /&gt;
&lt;b&gt;&lt;br /&gt;&lt;/b&gt;
&lt;b&gt;UAB Short Course on Statistical Genetics and Genomics:&amp;nbsp;&lt;/b&gt;&lt;a href="http://www.soph.uab.edu/ssg/nigmsstatgen/third" target="_blank"&gt;soph.uab.edu/ssg/...&lt;/a&gt;&lt;br /&gt;
Focusing on the state-of-the-art methodology to analyze complex traits, this five-day course will offer an interactive program to enhance researchers' ability to understand &amp;amp; use statistical genetic methods, as well as implement &amp;amp; interpret sophisticated genetic analyses. Topics include GWAS Design/Analysis/Imputation/Interpretation; Non-Mendelian Disorders Analysis; Pharmacogenetics/Pharmacogenomics; ELSI; Rare Variants &amp;amp; Exome Sequencing; Whole Genome Prediction; Analysis of DNA Methylation Microarray Data; Variant Calling from NGS Data; RNAseq: Experimental Design and Data Analysis; Analysis of ChIP-seq Data; Statistical Methods for NGS Data; Discovering new drugs &amp;amp; diagnostics from 300 billion points of data. Video recordings from the 2012 course are&amp;nbsp;&lt;a href="http://www.soph.uab.edu/ssg/nigmsstatgen/second" title="2nd Short Course on Statistical Genetics and Genomics  | Section on Statistical Genetics"&gt;available online&lt;/a&gt;.&lt;br /&gt;
&lt;b&gt;&lt;br /&gt;&lt;/b&gt;
&lt;b&gt;MBL Molecular Evolution Workshop:&amp;nbsp;&lt;/b&gt;&lt;a href="http://hermes.mbl.edu/education/courses/special_topics/mole.html" target="_blank"&gt;hermes.mbl.edu/education/...&lt;/a&gt;&lt;br /&gt;
One of the longest-running courses listed here (est. 1988), the Workshop on Molecular Evolution at Woods Hole presents a series of lectures, discussions, and bioinformatic exercises that span contemporary topics in molecular evolution. The course addresses phylogenetic analysis, population genetics, database and sequence matching, molecular evolution and development, and comparative genomics,&amp;nbsp;&lt;a href="https://molevol.mbl.edu/wiki/index.php/Software"&gt;using software packages&lt;/a&gt;&amp;nbsp;including AWTY, BEAST, BEST, Clustal W/X, FASTA, FigTree, GARLI, MIGRATE, LAMARC, MAFFT, MP-EST, MrBayes, PAML, PAUP*, PHYLIP, STEM, STEM-hy, and SeaView. Some of the course materials can be found by digging around the&amp;nbsp;&lt;a href="https://molevol.mbl.edu/wiki/index.php/Main_Page"&gt;course wiki&lt;/a&gt;.&lt;br /&gt;
&lt;b&gt;&lt;i&gt;&lt;br /&gt;&lt;/i&gt;&lt;/b&gt;
&lt;b&gt;&lt;i&gt;&lt;br class="Apple-interchange-newline" /&gt;Online Material:&lt;/i&gt;&lt;/b&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;Canadian Bioinformatics Workshops:&amp;nbsp;&lt;/b&gt;&lt;a href="http://bioinformatics.ca/workshops" target="_blank"&gt;bioinformatics.ca/workshops&lt;/a&gt;&lt;br /&gt;
(&lt;i&gt;In person workshop described above&lt;/i&gt;). Course material from&amp;nbsp;&lt;a href="http://bioinformatics.ca/workshops/2012" title="Past Workshops - 2012 | Bioinformatics.ca"&gt;past workshops&lt;/a&gt;&amp;nbsp;is freely available online, including both audio/video lectures and slideshows. Topics include&amp;nbsp;&lt;a href="http://bioinformatics.ca/workshops/2012/microarray-data-analysis" title="Microarray Data Analysis | Bioinformatics.ca"&gt;microarray analysis&lt;/a&gt;,&amp;nbsp;&lt;a href="http://bioinformatics.ca/workshops/2012/informatics-high-throughput-sequencing-data" title="Informatics on High Throughput Sequencing Data | Bioinformatics.ca"&gt;RNA-seq analysis&lt;/a&gt;, genome rearrangements, copy number alteration,&amp;nbsp;&lt;a href="http://bioinformatics.ca/workshops/2012/pathway-and-network-analysis-omics-data" title="Pathway and Network Analysis of -omics Data  | Bioinformatics.ca"&gt;network/pathway analysis&lt;/a&gt;, genome visualization, gene function prediction, functional annotation, data analysis using R,&amp;nbsp;&lt;a href="http://bioinformatics.ca/workshops/2012/informatics-and-statistics-metabolomics" title="Informatics and Statistics for Metabolomics | Bioinformatics.ca"&gt;statistics for metabolomics&lt;/a&gt;, and&amp;nbsp;&lt;a href="http://bioinformatics.ca/workshops/2012/informatics-high-throughput-sequencing-data" title="Informatics on High Throughput Sequencing Data | Bioinformatics.ca"&gt;much more&lt;/a&gt;.&lt;br /&gt;
&lt;b&gt;&lt;br /&gt;&lt;/b&gt;
&lt;b&gt;UC Davis Bioinformatics Training Program:&amp;nbsp;&lt;/b&gt;&lt;a href="http://training.bioinformatics.ucdavis.edu/" target="_blank"&gt;training.bioinformatics.ucdavis.edu&lt;/a&gt;&lt;br /&gt;
(&lt;i&gt;In person workshop described above&lt;/i&gt;). Every course's documentation is&amp;nbsp;&lt;a href="http://training.bioinformatics.ucdavis.edu/documentation/" title="Documentation | UC Davis Bioinformatics Training Program"&gt;freely available online&lt;/a&gt;, even if you didn't take the course.&amp;nbsp;&lt;a href="http://training.bioinformatics.ucdavis.edu/documentation/" title="Documentation | UC Davis Bioinformatics Training Program"&gt;Past topics&lt;/a&gt;&amp;nbsp;include Galaxy, Bioinformatics for NGS, cloud computing, and RNA-seq.&lt;br /&gt;
&lt;b&gt;&lt;br /&gt;&lt;/b&gt;
&lt;b&gt;MSU NGS Summer Course:&amp;nbsp;&lt;/b&gt;&lt;a href="http://bioinformatics.msu.edu/ngs-summer-course-2013" target="_blank"&gt;bioinformatics.msu.edu/ngs-summer-course-2013&lt;/a&gt;&lt;br /&gt;
(&lt;i&gt;In person workshop described above&lt;/i&gt;).&amp;nbsp;&lt;a href="http://ged.msu.edu/angus/"&gt;Materials from previous courses are freely available online&lt;/a&gt;&amp;nbsp;under a CC-by-SA license, which cover mapping, assembly, and analysis of short-read data for resequencing, ChIP-seq, and RNAseq.&lt;br /&gt;
&lt;b&gt;&lt;br /&gt;&lt;/b&gt;
&lt;b&gt;EMBL-EBI Train Online:&amp;nbsp;&lt;/b&gt;&lt;a href="http://www.ebi.ac.uk/training/online/" target="_blank"&gt;www.ebi.ac.uk/training/online&lt;/a&gt;&lt;br /&gt;
Train online provides free courses on Europe's most widely used data resources, created by experts at EMBL-EBI and collaborating institutes. Topics include&amp;nbsp;&lt;a href="http://www.ebi.ac.uk/training/online/subjects/11" title="Train online courses | Train online"&gt;Genes and Genomes&lt;/a&gt;,&amp;nbsp;&lt;a href="http://www.ebi.ac.uk/training/online/subjects/13" title="Train online courses | Train online"&gt;Gene Expression&lt;/a&gt;,&amp;nbsp;&lt;a href="http://www.ebi.ac.uk/training/online/subjects/479" title="Train online courses | Train online"&gt;Interactions, Pathways, and Networks&lt;/a&gt;, and others. Of particular interest may be the&amp;nbsp;&lt;a href="http://www.ebi.ac.uk/training/online/course/embo-practical-course-analysis-high-throughput-seq" title="EMBO Practical Course on Analysis of High-Throughput Sequencing Data  | Train online"&gt;Practical Course on Analysis of High-Throughput Sequencing Data&lt;/a&gt;, which covers Bioconductor packages for short read analysis, ChIP-Seq, RNA-seq, and allele-specific expression &amp;amp; eQTLs.&lt;br /&gt;
&lt;b&gt;&lt;br /&gt;&lt;/b&gt;
&lt;b&gt;UC Riverside Bioinformatics Manuals:&amp;nbsp;&lt;/b&gt;&lt;a href="http://manuals.bioinformatics.ucr.edu/" target="_blank"&gt;manuals.bioinformatics.ucr.edu&lt;/a&gt;&lt;br /&gt;
This is an excellent collection of manuals and code snippets. Topics include&amp;nbsp;&lt;a href="http://manuals.bioinformatics.ucr.edu/home/programming-in-r" title="R Programming - Manuals"&gt;Programming in R&lt;/a&gt;,&amp;nbsp;&lt;a href="http://manuals.bioinformatics.ucr.edu/home/R_BioCondManual" title="R and Bioconductor - Manuals"&gt;R+Bioconductor&lt;/a&gt;,&amp;nbsp;&lt;a href="http://manuals.bioinformatics.ucr.edu/home/ht-seq" title="NG Sequence - Manuals"&gt;Sequence Analysis with R and Bioconductor&lt;/a&gt;,&amp;nbsp;&lt;a href="http://manuals.bioinformatics.ucr.edu/home/gui-ngs-analysis" title="NGS GUI - Manuals"&gt;NGS analysis with Galaxy and IGV&lt;/a&gt;, basic&amp;nbsp;&lt;a href="http://manuals.bioinformatics.ucr.edu/home/linux-basics" title="Linux Basics - Manuals"&gt;Linux skills&lt;/a&gt;, and others.&lt;br /&gt;
&lt;b&gt;&lt;br /&gt;&lt;/b&gt;
&lt;b&gt;Software Carpentry:&amp;nbsp;&lt;/b&gt;&lt;a href="http://software-carpentry.org/" target="_blank"&gt;software-carpentry.org&lt;/a&gt;&lt;br /&gt;
Software Carpentry helps researchers be more productive by teaching them basic computing skills. We recently ran a&amp;nbsp;&lt;a href="http://gettinggeneticsdone.blogspot.com/2013/03/software-carpentry-bootcamp-at.html"&gt;2-day Software Carpentry Bootcamp here at UVA&lt;/a&gt;. Check out the&amp;nbsp;&lt;a href="http://software-carpentry.org/4_0/index.html"&gt;online lectures&lt;/a&gt;&amp;nbsp;for some introductory material on Unix, Python, Version Control, Databases, Automation, and many other topics.&lt;br /&gt;
&lt;b&gt;&lt;br /&gt;&lt;/b&gt;
&lt;b&gt;Coursera:&amp;nbsp;&lt;/b&gt;&lt;a href="https://www.coursera.org/courses" target="_blank"&gt;coursera.org/courses&lt;/a&gt;&lt;br /&gt;
Coursera partners with top universities to offer courses online for anyone to take, for free. Courses are usually 4-6 weeks, and consist of video lectures, quizzes, assignments, and exams. Joining a course gives you access to the course's forum where you can interact with the instructor and other participants. Relevant courses include&amp;nbsp;&lt;a href="https://www.coursera.org/course/dataanalysis"&gt;Data Analysis&lt;/a&gt;,&amp;nbsp;&lt;a href="https://www.coursera.org/course/compdata"&gt;Computing for Data Analysis using R&lt;/a&gt;, and&amp;nbsp;&lt;a href="https://www.coursera.org/course/bioinformatics"&gt;Bioinformatics Algorithms&lt;/a&gt;, among others. You can also&amp;nbsp;&lt;a href="http://www.youtube.com/user/jtleek2007/videos?flow=grid&amp;amp;tag_id=UC8xNPQ-3a5t9uMU7Vah-jWA.3.coursera&amp;amp;view=46" title="Jeff Leek - YouTube"&gt;view all of Jeff Leek's Data Analysis lectures on Youtube&lt;/a&gt;.&lt;br /&gt;
&lt;b&gt;Rosalind:&amp;nbsp;&lt;/b&gt;&lt;a href="http://rosalind.info/" target="_blank"&gt;http://rosalind.info&lt;/a&gt;&lt;br /&gt;
Quite different from the others listed here, Rosalind is a platform for learning bioinformatics through gaming-like problem solving. Visit the&amp;nbsp;&lt;a href="http://rosalind.info/problems/list-view/?location=python-village" target="_blank" title="ROSALIND | Problems"&gt;Python Village&lt;/a&gt;&amp;nbsp;to learn the basics of Python. Arm yourself at the&amp;nbsp;&lt;a href="http://rosalind.info/problems/list-view/?location=bioinformatics-armory" target="_blank" title="ROSALIND | Problems"&gt;Bioinformatics Armory&lt;/a&gt;, equipping yourself with existing ready-to-use bioinformatics software tools. Or storm the&amp;nbsp;&lt;a href="http://rosalind.info/problems/list-view/" target="_blank" title="ROSALIND | Problems"&gt;Bioinformatics Stronghold&lt;/a&gt;, implementing your own algorithms for computational mass spectrometry, alignment, dynamic programming, genome assembly, genome rearrangements, phylogeny, probability, string algorithms and others.&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;&lt;i&gt;&lt;br class="Apple-interchange-newline" /&gt;Other Resources:&lt;/i&gt;&lt;/b&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a href="http://ged.msu.edu/angus/bioinformatics-courses.html" target="_blank" title="A list of bioinformatics courses — ANGUS 2.0 documentation"&gt;Titus Brown's list of bioinformatics courses&lt;/a&gt;: Includes a few others not listed here (also see the comments).&lt;/li&gt;
&lt;li&gt;&lt;a href="http://gmod.org/wiki/Training_and_Outreach" target="_blank" title="Training and Outreach - GMOD"&gt;GMOD Training and Outreach&lt;/a&gt;: GMOD is the Generic Model Organism Database project, a collection of open source software tools for creating and managing genome-scale biological databases. This page links out to tutorials on&amp;nbsp;&lt;a href="http://gmod.org/wiki/GMOD_Components" title="GMOD Components - GMOD"&gt;GMOD Components&lt;/a&gt;&amp;nbsp;such as Apollo, BioMart, Galaxy, GBrowse, MAKER, and others.&lt;/li&gt;
&lt;li&gt;&lt;a href="http://seqanswers.com/" target="_blank" title="SEQanswers Home"&gt;Seqanswers.com&lt;/a&gt;: A discussion forum for anything related to Bioinformatics, including Q&amp;amp;A, paper discussions, new software announcements, protocols, and more.&lt;/li&gt;
&lt;li&gt;&lt;a href="http://www.biostars.org/" target="_blank" title="Bioinformatics Answers"&gt;Biostars.org&lt;/a&gt;: Similar to SEQanswers, but more strictly a Q&amp;amp;A site.&lt;/li&gt;
&lt;li&gt;&lt;a href="http://www.bioconductor.org/help/mailing-list/" target="_blank" title="Bioconductor - Mailing Lists"&gt;BioConductor Mailing list&lt;/a&gt;: A very active mailing list for getting help with Bioconductor packages. Make sure you do some Google searching yourself first before posting to this list.&lt;/li&gt;
&lt;li&gt;&lt;a href="http://www.bioconductor.org/help/events/" target="_blank" title="Bioconductor - Events"&gt;Bioconductor Events&lt;/a&gt;: List of upcoming and prior Bioconductor training and events worldwide.&lt;/li&gt;
&lt;li&gt;&lt;a href="http://wiki.galaxyproject.org/Learn" target="_blank" title="Learn - Galaxy Wiki"&gt;Learn Galaxy&lt;/a&gt;: Screencasts and tutorials for learning to use Galaxy.&lt;/li&gt;
&lt;li&gt;&lt;a href="http://wiki.galaxyproject.org/Events" target="_blank" title="Events - Galaxy Wiki"&gt;Galaxy Event Horizon&lt;/a&gt;: Worldwide Galaxy-related events (workshops, training, user meetings) are listed here.&lt;/li&gt;
&lt;li&gt;&lt;a href="https://main.g2.bx.psu.edu/u/jeremy/p/galaxy-rna-seq-analysis-exercise" target="_blank"&gt;Galaxy RNA-Seq Exercise&lt;/a&gt;: Run through a small RNA-seq study from start to finish using Galaxy.&lt;/li&gt;
&lt;li&gt;&lt;a href="http://www.youtube.com/user/RafalabChannel" target="_blank" title="Rafael Irizarry - YouTube"&gt;Rafael Irizarry's Youtube Channel&lt;/a&gt;: Several statistics and bioinformatics video lectures.&lt;/li&gt;
&lt;li&gt;&lt;a href="http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1002632" target="_blank" title="PLOS Computational Biology: An Online Bioinformatics Curriculum"&gt;PLoS Comp Bio Online Bioinformatics Curriculum&lt;/a&gt;: A perspective paper by David B Searls outlining a series of free online learning initiatives for beginning to advanced training in biology, biochemistry, genetics, computational biology, genomics, math, statistics, computer science, programming, web development, databases, parallel computing, image processing, AI, NLP, and more.&lt;/li&gt;
&lt;li&gt;&lt;a href="http://gettinggeneticsdone.blogspot.com/" target="_blank" title="Getting Genetics Done"&gt;Getting Genetics Done&lt;/a&gt;: Shameless plug – I write a blog highlighting literature of interest, new tools, and occasionally tutorials in genetics, statistics, and bioinformatics. I recently wrote&amp;nbsp;&lt;a href="http://gettinggeneticsdone.blogspot.com/2012/05/how-to-stay-current-in.html" title="Getting Genetics Done: How to Stay Current in Bioinformatics/Genomics"&gt;this post about how to stay current in bioinformatics &amp;amp; genomics&lt;/a&gt;.&lt;/li&gt;
&lt;/ul&gt;
&lt;div class="blogger-post-footer"&gt;Getting Genetics Done by Stephen Turner is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/GettingGeneticsDone/~4/CuuhHM2rj8g" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://gettinggeneticsdone.blogspot.com/feeds/854262577874454434/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2013/04/list-of-bioinformatics-workshops-training.html#comment-form" title="13 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/854262577874454434?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/854262577874454434?v=2" /><link rel="alternate" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2013/04/list-of-bioinformatics-workshops-training.html" title="List of Bioinformatics Workshops and Training Resources" /><author><name>Stephen Turner</name><uri>http://www.blogger.com/profile/06656711316726116187</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="26" height="32" src="http://3.bp.blogspot.com/-aT3qBWI4VYc/TgvR9CnlS0I/AAAAAAAAMDk/KuA2GGqURcc/s220/pic2-cropped-400x500.jpg" /></author><thr:total>13</thr:total></entry><entry gd:etag="W/&quot;DEUHQng-eSp7ImA9WhBXFE0.&quot;"><id>tag:blogger.com,1999:blog-6232819486261696035.post-6305468138511068428</id><published>2013-03-27T12:43:00.000-05:00</published><updated>2013-03-27T12:43:53.651-05:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2013-03-27T12:43:53.651-05:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Machine Learning" /><category scheme="http://www.blogger.com/atom/ns#" term="Clustering" /><category 
scheme="http://www.blogger.com/atom/ns#" term="Conferences" /><category scheme="http://www.blogger.com/atom/ns#" term="Bioinformatics" /><title>Evolutionary Computation and Data Mining in Biology</title><content type="html">For over 15 years, members of the computer science, machine
learning, and data mining communities have gathered in a beautiful European
location each spring to share ideas about biologically-inspired
computation.&amp;nbsp; Stemming from the work of
John Holland who pioneered the field of genetic algorithms, multiple approaches
have been developed that exploit the dynamics of natural systems to solve
computational problems.&amp;nbsp; These algorithms
have been applied in a wide variety of fields, and to celebrate and cross-pollinate
ideas from these various disciplines the &lt;a href="http://www.kevinsim.co.uk/evostar2013/"&gt;EvoStar event&lt;/a&gt; co-locates five
conferences at the same venue, covering genetic programming (EuroGP),
combinatorial optimization (EvoCOP), music, art, and design (EvoMUSART),
multidisciplinary applications (EvoApplications), and computational biology
(EvoBIO).&amp;nbsp; EvoStar 2013 will be held in
Vienna, Austria on April 3-5, and is always expertly coordinated by the
wonderful &lt;a href="http://www.iidi.napier.ac.uk/c/people/peopleid/126"&gt;Jennifer
Willies&lt;/a&gt; from Napier University, UK. Multiple research groups from the US and
Europe will attend to present their exciting work in these areas.&lt;br /&gt;
&lt;br /&gt;
&lt;div class="MsoNormal"&gt;
&lt;o:p&gt;&lt;/o:p&gt;&lt;/div&gt;
&lt;div class="MsoNormal"&gt;
Many problems in bioinformatics and statistical analysis use
what are considered “greedy” algorithms to fit parameters to data – that is,
they settle on a &lt;i&gt;nearby &lt;/i&gt;collection of
parameters as the solution and potentially miss a &lt;i&gt;global best solution&lt;/i&gt;.&amp;nbsp; This
problem is well-known in the computer science community for toy problems like &lt;a href="http://en.wikipedia.org/wiki/Bin_packing_problem"&gt;bin
packing&lt;/a&gt;&amp;nbsp;or the &lt;a href="http://en.wikipedia.org/wiki/Knapsack_problem"&gt;knapsack
problem&lt;/a&gt;.&amp;nbsp; In human genetics,
related problems are partitioning complex pedigrees or selecting maximally
unrelated individuals from a dataset, and can also appear when maximizing
likelihood equations.&lt;/div&gt;
&lt;o:p&gt;&lt;/o:p&gt;&lt;br /&gt;
&lt;div class="MsoNormal"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="MsoNormal"&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://1.bp.blogspot.com/-nx9KQ7ZGWuQ/UVCKiTzee2I/AAAAAAAABSg/lrv82hIIjs4/s1600/Untitled.png" imageanchor="1" style="clear: left; float: left; margin-bottom: 1em; margin-right: 1em;"&gt;&lt;img border="0" src="http://1.bp.blogspot.com/-nx9KQ7ZGWuQ/UVCKiTzee2I/AAAAAAAABSg/lrv82hIIjs4/s1600/Untitled.png" height="320" width="207" /&gt;&lt;/a&gt;&lt;/div&gt;
EvoBIO focuses on using biologically-inspired algorithms
(like genetic algorithms) to improve performance for many bioinformatics
tasks.&amp;nbsp; For example, Stephen and I have
both applied these methods for analysis of genetic data using &lt;a href="http://link.springer.com/chapter/10.1007%2F978-3-642-01184-9_8?LI=true"&gt;neural networks&lt;/a&gt;, and for &lt;a href="http://link.springer.com/chapter/10.1007%2F978-3-540-78757-0_3?LI=true#page-1"&gt;forward-time
genetic data simulation&lt;/a&gt; (additional details &lt;a href="http://books.google.com/books?hl=en&amp;amp;lr=&amp;amp;id=FLKxycSL3ukC&amp;amp;oi=fnd&amp;amp;pg=PA1&amp;amp;ots=jA6iR8Iqt0&amp;amp;sig=vlDMC4HIm-RKAXsgK94qk_5KkBA#v=onepage&amp;amp;q&amp;amp;f=false"&gt;here&lt;/a&gt;).&lt;/div&gt;
&lt;o:p&gt;&lt;/o:p&gt;

&lt;br /&gt;
&lt;div class="MsoNormal"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="MsoNormal"&gt;
EvoBIO is very pleased to be sponsored by BMC Biodata
Mining, a natural partner for this conference. &amp;nbsp;I recently &lt;a href="http://blogs.biomedcentral.com/bmcblog/2013/03/27/evobio-2013-seeking-creative-solutions-to-complex-biological-problems/"&gt;wrote a blog post&lt;/a&gt; for BioMed Central about EvoBIO as well. &amp;nbsp;Thanks to their sponsorship, the winner of the EvoBIO best paper award will receive free publication in Biodata Mining, and runners-up will receive a 25% discount off the article processing charge. &lt;br /&gt;
&lt;br /&gt;
So, if you are in the mood for a new conference
and would like to see and influence some of these creative approaches to data
analysis, consider attending EvoStar -- We'd love to see you there!&lt;br /&gt;
&lt;o:p&gt;&lt;/o:p&gt;&lt;/div&gt;
&lt;div class="blogger-post-footer"&gt;Getting Genetics Done by Stephen Turner is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/GettingGeneticsDone/~4/BX78x0ObS7M" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://gettinggeneticsdone.blogspot.com/feeds/6305468138511068428/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2013/03/evolutionary-computation-and-data.html#comment-form" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/6305468138511068428?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/6305468138511068428?v=2" /><link rel="alternate" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2013/03/evolutionary-computation-and-data.html" title="Evolutionary Computation and Data Mining in Biology" /><author><name>Will</name><uri>http://www.blogger.com/profile/09703349044940180835</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="http://1.bp.blogspot.com/-DPV6nrTtGHY/UPTSZniomRI/AAAAAAAABRc/_nDd2s1gwko/s220/19bf088.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://1.bp.blogspot.com/-nx9KQ7ZGWuQ/UVCKiTzee2I/AAAAAAAABSg/lrv82hIIjs4/s72-c/Untitled.png" height="72" width="72" /><thr:total>0</thr:total></entry><entry gd:etag="W/&quot;CkYHQnw6fCp7ImA9WhBQF00.&quot;"><id>tag:blogger.com,1999:blog-6232819486261696035.post-8376227675976802899</id><published>2013-03-19T08:35:00.000-05:00</published><updated>2013-03-19T08:35:33.214-05:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2013-03-19T08:35:33.214-05:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" 
term="Databases" /><category scheme="http://www.blogger.com/atom/ns#" term="Tutorials" /><category scheme="http://www.blogger.com/atom/ns#" term="Recommended Reading" /><category scheme="http://www.blogger.com/atom/ns#" term="Software" /><category scheme="http://www.blogger.com/atom/ns#" term="Conferences" /><category scheme="http://www.blogger.com/atom/ns#" term="SQL" /><category scheme="http://www.blogger.com/atom/ns#" term="Announcements" /><title>Software Carpentry Bootcamp at University of Virginia</title><content type="html">A couple of weeks ago I, with the help of &lt;a href="http://www.uvacse.virginia.edu/"&gt;others&lt;/a&gt; &lt;a href="http://www2.lib.virginia.edu/brown/data/"&gt;here&lt;/a&gt; &lt;a href="http://www.hsl.virginia.edu/"&gt;at&lt;/a&gt; &lt;a href="http://www.virginia.edu/vpr/"&gt;UVA&lt;/a&gt;,&amp;nbsp;organized a &lt;a href="http://software-carpentry.org/"&gt;Software Carpentry&lt;/a&gt; &lt;a href="http://software-carpentry.org/bootcamps/index.html"&gt;bootcamp&lt;/a&gt;, instructed by &lt;a href="http://users.ecs.soton.ac.uk/stc/"&gt;Steve Crouch&lt;/a&gt;, &lt;a href="https://www.msu.edu/~carlosja/"&gt;Carlos Anderson&lt;/a&gt;, and &lt;a href="http://www.bendmorris.com/"&gt;Ben Morris&lt;/a&gt;. The day before the course started, Charlottesville was racked by nearly a foot of snow, widespread power outages, and many cancelled incoming flights. Luckily our instructors arrived just in time, and power was (mostly) restored shortly before the boot camp started. Despite the conditions, the course was very well-attended.&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://1.bp.blogspot.com/-7R0ojB-PsJs/UUc7vF5T28I/AAAAAAABDP4/tKl-V3JjGjY/s1600/swc-workshop.jpg" imageanchor="1" style="clear: left; float: left; margin-bottom: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="240" src="http://1.bp.blogspot.com/-7R0ojB-PsJs/UUc7vF5T28I/AAAAAAABDP4/tKl-V3JjGjY/s320/swc-workshop.jpg" width="320" /&gt;&lt;/a&gt;&lt;/div&gt;
Software Carpentry's aim is to teach researchers (usually graduate students) basic computing concepts and skills so that they can get more done in less time, and with less pain. They're a volunteer organization funded by Mozilla and the Sloan foundation, and led this two-day bootcamp completely free of charge to us.&lt;br /&gt;
&lt;br /&gt;
The course started out with a head-first dive into Unix and Bash scripting, followed by a tutorial on automation with Make, concluding the first day with an introduction to Python. The second day covered version control with git, Python code testing, and wrapped up with an introduction to databases and SQL. At the conclusion of the course, participants offered near-universal positive feedback, with the git and Make tutorials being exceptionally popular.&lt;br /&gt;
&lt;br /&gt;
Software Carpentry's approach to teaching these topics is unlike many others that I've seen. Rather than lecturing on for hours, the instructors inject very short (~5 minute) partnered exercises between every ~15 minutes of instruction in 1.5 hour sessions. With two full days of intensive instruction and your computer in front of you, it's all too easy to get distracted by an email, get lost in your everyday responsibilities, and zone out for the rest of the session. &amp;nbsp;The exercises keep participants paying attention and accountable to their partner.&lt;br /&gt;
&lt;br /&gt;
All of the bootcamp's materials are freely available:&lt;br /&gt;
&lt;br /&gt;
Unix and Bash: &lt;a href="https://github.com/redcurry/bash_tutorial"&gt;https://github.com/redcurry/bash_tutorial&lt;/a&gt;&lt;br /&gt;
Python Introduction: &lt;a href="https://github.com/redcurry/python_tutorial"&gt;https://github.com/redcurry/python_tutorial&lt;/a&gt;&lt;br /&gt;
Git tutorial: &lt;a href="https://github.com/redcurry/git_tutorial"&gt;https://github.com/redcurry/git_tutorial&lt;/a&gt;&lt;br /&gt;
Databases &amp;amp; SQL: &lt;a href="https://github.com/bendmorris/swc_databases"&gt;https://github.com/bendmorris/swc_databases&lt;/a&gt;&lt;br /&gt;
Everything else: &lt;a href="http://users.ecs.soton.ac.uk/stc/SWC/tutorial-materials-virginia.zip"&gt;http://users.ecs.soton.ac.uk/stc/SWC/tutorial-materials-virginia.zip&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
Perhaps more relevant to a broader audience are the &lt;a href="http://software-carpentry.org/4_0/index.html"&gt;online lectures and materials available on the Software Carpentry Website&lt;/a&gt;, which include all the above topics, as well as many others.&lt;br /&gt;
&lt;br /&gt;
We capped the course at 50, and had 95 register within a day of opening registration, so we'll likely do this again in the future. I sit in countless meetings where faculty lament how nearly all basic science researchers enter grad school or their postdoc woefully unprepared for this brave new world of data-rich high-throughput science.&amp;nbsp;&lt;a href="http://stephenturner.us/p/edu#online"&gt;Self-paced online learning&lt;/a&gt; works well for some, but if you're in a department or other organization that could benefit from a free, on-site, intensive introduction to the topics listed above, I highly recommend &lt;a href="mailto:info@software-carpentry.org"&gt;contacting Software Carpentry&lt;/a&gt; and organizing your own &lt;a href="http://software-carpentry.org/bootcamps/index.html"&gt;bootcamp&lt;/a&gt;.&lt;br /&gt;
&lt;br /&gt;
Finally, when organizing an optional section of the course, we let participants vote whether they preferred learning number crunching with NumPy, or SQL/databases; SQL won by a small margin. However, Katherine Holcomb in UVACSE has graciously volunteered to teach a&amp;nbsp;&lt;a href="http://www.uvacse.virginia.edu/events/introduction-to-numpy/"&gt;two-hour introduction to NumPy this week&lt;/a&gt;, regardless of whether you participated in the boot camp (although some basic Python knowledge is recommended). This (free) short course is this Thursday, March 21, 2-4pm, in the same place as the bootcamp (Brown Library Classroom in Clark Hall). &lt;a href="http://www.uvacse.virginia.edu/events/introduction-to-numpy/"&gt;Sign up here&lt;/a&gt;.&lt;br /&gt;
&lt;br /&gt;&lt;div class="blogger-post-footer"&gt;Getting Genetics Done by Stephen Turner is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/GettingGeneticsDone/~4/l76Qa2XbCLo" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://gettinggeneticsdone.blogspot.com/feeds/8376227675976802899/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2013/03/software-carpentry-bootcamp-at.html#comment-form" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/8376227675976802899?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/8376227675976802899?v=2" /><link rel="alternate" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2013/03/software-carpentry-bootcamp-at.html" title="Software Carpentry Bootcamp at University of Virginia" /><author><name>Stephen Turner</name><uri>http://www.blogger.com/profile/06656711316726116187</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="26" height="32" src="http://3.bp.blogspot.com/-aT3qBWI4VYc/TgvR9CnlS0I/AAAAAAAAMDk/KuA2GGqURcc/s220/pic2-cropped-400x500.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://1.bp.blogspot.com/-7R0ojB-PsJs/UUc7vF5T28I/AAAAAAABDP4/tKl-V3JjGjY/s72-c/swc-workshop.jpg" height="72" width="72" /><thr:total>0</thr:total></entry><entry gd:etag="W/&quot;DUIBRHo_fip7ImA9WhBRFEw.&quot;"><id>tag:blogger.com,1999:blog-6232819486261696035.post-7626583786825926842</id><published>2013-03-04T11:19:00.001-06:00</published><updated>2013-03-04T11:19:15.446-06:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2013-03-04T11:19:15.446-06:00</app:edited><category 
scheme="http://www.blogger.com/atom/ns#" term="Metagenomics" /><category scheme="http://www.blogger.com/atom/ns#" term="Recommended Reading" /><category scheme="http://www.blogger.com/atom/ns#" term="Software" /><category scheme="http://www.blogger.com/atom/ns#" term="Bioinformatics" /><title>Comparing Sequence Classification Algorithms for Metagenomics</title><content type="html">&lt;b&gt;Metagenomics&lt;/b&gt; is the study of DNA collected from environmental samples (e.g., seawater, soil, acid mine drainage, the human gut, sputum, pus, etc.). While traditional microbial genomics typically means sequencing a pure cultured isolate, metagenomics involves taking a culture-free environmental sample and sequencing a single gene (e.g. the 16S rRNA gene), multiple marker genes, or shotgun sequencing everything in the sample in order to determine what's there.&lt;br /&gt;
&lt;br /&gt;
A challenge in shotgun metagenomics analysis is the &lt;b&gt;sequence classification&lt;/b&gt;&amp;nbsp;problem: i.e., given a sequence, what's its origin? I.e., did this sequence read come from &lt;i&gt;E. coli&lt;/i&gt;&amp;nbsp;or some other enteric bacteria? Note that sequence classification does not involve &lt;i&gt;genome assembly - &lt;/i&gt;sequence classification is done on &lt;i&gt;unassembled reads&lt;/i&gt;.&amp;nbsp;If you could perfectly classify the origin of every sequence read in your sample, you would know exactly what organisms are in your environmental sample and how abundant each one is.&lt;br /&gt;
&lt;br /&gt;
The solution to this problem isn't simply BLAST'ing every sequence read that comes off your HiSeq 2500 against NCBI nt/nr. The computational cost of this BLAST search would be many times more expensive than the sequencing itself. There are &lt;i&gt;many&lt;/i&gt;&amp;nbsp;algorithms for sequence classification. This paper examines a wide range of the available algorithms and software implementations for sequence classification as applied to metagenomic data:&lt;br /&gt;
&lt;br /&gt;
Bazinet, Adam L., and Michael P. Cummings. "&lt;a href="http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3428669/"&gt;A comparative evaluation of sequence classification programs.&lt;/a&gt;" &lt;i&gt;BMC Bioinformatics&lt;/i&gt; 13.1 (2012): 92.&lt;br /&gt;
&lt;br /&gt;
In this paper, the authors comprehensively evaluated the performance of over 25 programs that fall into three categories: alignment-based, composition-based, and phylogeny-based. For illustrative purposes, the authors constructed a "phylogenetic tree" that shows how the 25 methods they evaluated are related to each other:&lt;br /&gt;
&lt;br /&gt;
&lt;table align="center" cellpadding="0" cellspacing="0" class="tr-caption-container" style="margin-left: auto; margin-right: auto; text-align: center;"&gt;&lt;tbody&gt;
&lt;tr&gt;&lt;td style="text-align: center;"&gt;&lt;a href="http://4.bp.blogspot.com/-EuyxyKG3oqU/US4STwYxuBI/AAAAAAABDEU/wRx49tABaA8/s1600/classification-fig1.png" imageanchor="1" style="margin-left: auto; margin-right: auto;"&gt;&lt;img border="0" src="http://4.bp.blogspot.com/-EuyxyKG3oqU/US4STwYxuBI/AAAAAAABDEU/wRx49tABaA8/s1600/classification-fig1.png" /&gt;&lt;/a&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class="tr-caption" style="text-align: center;"&gt;&lt;b&gt;Figure 1: Program clustering.&amp;nbsp;&lt;/b&gt;A neighbor-joining tree that clusters the classification programs based on their similar attributes.&lt;/td&gt;&lt;/tr&gt;
&lt;/tbody&gt;&lt;/table&gt;
&lt;br /&gt;
The performance evaluation was done on several different datasets where the composition was known, using a similar set of evaluation criteria (sensitivity = number of correct assignments / number of sequences in the data; precision = number of correct assignments/number of assignments made). They concluded that the performance of particular methods varied widely between datasets due to reasons like highly variable taxonomic composition and diversity, level of sequence representation in underlying databases, read lengths, and read quality. The authors specifically point out that just because some methods lack sensitivity (as they've defined it), they are still useful because they have high precision. For example, marker-based approaches (like &lt;a href="http://metaphyler.cbcb.umd.edu/"&gt;Metaphyler&lt;/a&gt;) might only classify a small number of reads, but they're highly precise, and may still be enough to accurately recapitulate organismal distribution and abundance.&lt;br /&gt;
&lt;br /&gt;
Importantly, the authors note that you can't ignore computational requirements, which varied by orders of magnitude between methods. Selection of the right method depends on the goals (is sensitivity or precision more important?) and the available resources (time and compute power are never infinite - these are tangible limitations that are imposed in the real world).&lt;br /&gt;
&lt;br /&gt;
This paper was first received at BMC Bioinformatics a year ago, and since then many new methods for sequence classification have been published. Further, this paper only evaluates methods for classification of unassembled reads, and does not evaluate methods that rely on metagenome assembly (that's the subject of another much longer post, but check out &lt;a href="http://ivory.idyll.org/blog/"&gt;Titus Brown's blog&lt;/a&gt; for lots more on this topic).&lt;br /&gt;
&lt;br /&gt;
Overall, this paper was a great demonstration of how one might attempt to evaluate many different tools ostensibly aimed at solving the same problem but functioning in completely different ways.&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3428669/"&gt;Bazinet, Adam L., and Michael P. Cummings. "A comparative evaluation of sequence classification programs."&amp;nbsp;&lt;i&gt;BMC Bioinformatics&lt;/i&gt;&amp;nbsp;13.1 (2012): 92.&lt;/a&gt;&lt;div class="blogger-post-footer"&gt;Getting Genetics Done by Stephen Turner is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/GettingGeneticsDone/~4/p1t6ke6jThA" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://gettinggeneticsdone.blogspot.com/feeds/7626583786825926842/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2013/03/comparing-sequence-classification.html#comment-form" title="1 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/7626583786825926842?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/7626583786825926842?v=2" /><link rel="alternate" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2013/03/comparing-sequence-classification.html" title="Comparing Sequence Classification Algorithms for Metagenomics" /><author><name>Stephen Turner</name><uri>http://www.blogger.com/profile/06656711316726116187</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="26" height="32" src="http://3.bp.blogspot.com/-aT3qBWI4VYc/TgvR9CnlS0I/AAAAAAAAMDk/KuA2GGqURcc/s220/pic2-cropped-400x500.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://4.bp.blogspot.com/-EuyxyKG3oqU/US4STwYxuBI/AAAAAAABDEU/wRx49tABaA8/s72-c/classification-fig1.png" height="72" width="72" /><thr:total>1</thr:total></entry><entry 
gd:etag="W/&quot;CEEFSH8-eip7ImA9WhBSE0U.&quot;"><id>tag:blogger.com,1999:blog-6232819486261696035.post-8926766417280705079</id><published>2013-02-20T11:50:00.000-06:00</published><updated>2013-02-20T11:50:19.152-06:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2013-02-20T11:50:19.152-06:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Pathways" /><category scheme="http://www.blogger.com/atom/ns#" term="Tutorials" /><category scheme="http://www.blogger.com/atom/ns#" term="Web Apps" /><category scheme="http://www.blogger.com/atom/ns#" term="Visualization" /><title>NetGestalt for Data Visualization in the Context of Pathways</title><content type="html">Many of you may be familiar with &lt;a href="http://bioinfo.vanderbilt.edu/webgestalt/"&gt;WebGestalt&lt;/a&gt;, a wonderful web utility developed by Bing Zhang at Vanderbilt for doing basic gene-set enrichment analyses.  Last year, we invited Bing to speak at our annual retreat for the &lt;a href="http://chgr.mc.vanderbilt.edu/page/education"&gt;Vanderbilt Graduate Program in Human Genetics&lt;/a&gt;, and he did not disappoint!  Bing walked us through his new tool called &lt;a href="http://www.netgestalt.org/"&gt;NetGestalt&lt;/a&gt;.&lt;br /&gt;
&lt;br /&gt;
NetGestalt provides users with the ability to overlay large-scale experimental data onto biological networks.  Data are loaded  using continuous and binary tracks that can contain either single or multiple lines of data (called composite tracks). Continuous tracks could be gene expression intensities from microarray data or any other quantitative measure that can be mapped to the genome. &amp;nbsp;Binary tracks are usually insertion/deletion regions, or called regions like ChIP peaks. &amp;nbsp;NetGestalt extends many of the features of WebGestalt, including enrichment analysis for modules within a biological network, and provides easy ways to visualize the overlay of multiple tracks with Venn diagrams.&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://2.bp.blogspot.com/-4feL_6H20dQ/USUJdXDJHCI/AAAAAAAABSQ/EZJP5Iw9XCw/s1600/netgestalt.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="337" src="http://2.bp.blogspot.com/-4feL_6H20dQ/USUJdXDJHCI/AAAAAAAABSQ/EZJP5Iw9XCw/s640/netgestalt.png" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;br /&gt;
NetGestalt provides a very nice interface for interacting with data. Extensive documentation on how to use it can be found &lt;a href="http://www.netgestalt.org/main/doc/NetGestalt_Manual.pdf"&gt;here.&lt;/a&gt; &amp;nbsp;Bing and his colleagues also went the extra mile to create &lt;a href="http://www.netgestalt.org/#1"&gt;video tutorials&lt;/a&gt;&amp;nbsp;on how to use their web tool, and walk you through an analysis of some tumor data. &lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://www.netgestalt.org/"&gt;http://www.netgestalt.org/&lt;/a&gt;&lt;div class="blogger-post-footer"&gt;Getting Genetics Done by Stephen Turner is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/GettingGeneticsDone/~4/MOpLDZwTcaY" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://gettinggeneticsdone.blogspot.com/feeds/8926766417280705079/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2013/02/netgestalt-for-data-visualization-in.html#comment-form" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/8926766417280705079?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/8926766417280705079?v=2" /><link rel="alternate" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2013/02/netgestalt-for-data-visualization-in.html" title="NetGestalt for Data Visualization in the Context of Pathways" /><author><name>Will</name><uri>http://www.blogger.com/profile/09703349044940180835</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="http://1.bp.blogspot.com/-DPV6nrTtGHY/UPTSZniomRI/AAAAAAAABRc/_nDd2s1gwko/s220/19bf088.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://2.bp.blogspot.com/-4feL_6H20dQ/USUJdXDJHCI/AAAAAAAABSQ/EZJP5Iw9XCw/s72-c/netgestalt.png" height="72" width="72" /><thr:total>0</thr:total></entry><entry gd:etag="W/&quot;DUQDQn08cSp7ImA9WhBTFkU.&quot;"><id>tag:blogger.com,1999:blog-6232819486261696035.post-742926808275915523</id><published>2013-02-12T10:40:00.001-06:00</published><updated>2013-02-12T10:42:53.379-06:00</updated><app:edited 
xmlns:app="http://www.w3.org/2007/app">2013-02-12T10:42:53.379-06:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Recommended Reading" /><category scheme="http://www.blogger.com/atom/ns#" term="R" /><category scheme="http://www.blogger.com/atom/ns#" term="Bioinformatics" /><title>"Document Design and Purpose, Not Mechanics"</title><content type="html">If you ever write code for scientific computing (chances are you do if you're here), stop what you're doing and spend 8 minutes reading this open-access paper:&lt;br /&gt;
&lt;div class="p2"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
Wilson et al. Best Practices for Scientific Computing. &lt;a href="http://arxiv.org/abs/1210.0530"&gt;arXiv:1210.0530&lt;/a&gt; (2012). (&lt;a href="http://arxiv.org/pdf/1210.0530v3"&gt;Direct link to PDF&lt;/a&gt;).&lt;/div&gt;
&lt;div class="p2"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
The paper makes a number of good points regarding software as a tool just like any other lab equipment: it should be built, validated, and used as carefully as any other physical instrumentation. Yet most scientists who write software are self-taught, and haven't been properly trained in fundamental software development skills.&amp;nbsp;&lt;/div&gt;
&lt;div class="p2"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
The paper outlines ten practices every computational biologist should adopt when writing code for research computing. Most of these are the usual suspects that you'd probably guess - using version control, workflow management, writing good documentation, modularizing code into functions, unit testing, agile development, etc. One that particularly jumped out at me was the recommendation to document design and purpose, not mechanics.&amp;nbsp;&lt;/div&gt;
&lt;div class="p2"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
We all know that good comments and documentation are critical for code reproducibility and maintenance, but inline documentation that recapitulates the code is hardly useful. Instead, we should aim to document the underlying ideas, interface, and reasons, not the implementation.&lt;/div&gt;
&lt;div class="p2"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
For example, the following commentary is hardly useful:&lt;/div&gt;
&lt;div class="p2"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
&lt;span style="font-family: Courier New, Courier, monospace;"&gt;# Increment the variable "i" by one.&lt;/span&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
&lt;span style="font-family: Courier New, Courier, monospace;"&gt;i = i+1&lt;!-----&gt;&lt;!-----&gt;&lt;/span&gt;&lt;/div&gt;
&lt;div class="p2"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
The real recommendation here is that if your code requires such substantial documentation of the actual implementation to be understandable, &lt;i&gt;it's better to spend the time rewriting the code rather than writing a lengthy description of what it does&lt;/i&gt;. I'm very guilty of doing this with R code, nesting multiple levels of functions and vector operations:&lt;/div&gt;
&lt;div class="p2"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
&lt;span style="font-family: Courier New, Courier, monospace;"&gt;# It would take a paragraph to explain what this is doing.&lt;/span&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
&lt;span style="font-family: Courier New, Courier, monospace;"&gt;# Better to break up into multiple lines of code.&lt;/span&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
&lt;span style="font-family: Courier New, Courier, monospace;"&gt;sapply(data.frame(n=sapply(x, function(d) sum(is.na(d)))), function(dd) mean(dd))&lt;/span&gt;&lt;/div&gt;
&lt;div class="p2"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
It would take much more time to properly document what this is doing than it would take to split the operation into manageable chunks over multiple lines such that the code no longer needs an explanation. We're not playing &lt;a href="http://en.wikipedia.org/wiki/Code_golf"&gt;code golf&lt;/a&gt; here - using fewer lines doesn't make you a better programmer.&lt;/div&gt;
&lt;div class="p1"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
&lt;a href="http://arxiv.org/abs/1210.0530"&gt;Best Practices for Scientific Computing&lt;/a&gt;&lt;/div&gt;
&lt;div class="blogger-post-footer"&gt;Getting Genetics Done by Stephen Turner is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/GettingGeneticsDone/~4/GaRusax3It0" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://gettinggeneticsdone.blogspot.com/feeds/742926808275915523/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2013/02/document-design-and-purpose-not.html#comment-form" title="2 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/742926808275915523?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/742926808275915523?v=2" /><link rel="alternate" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2013/02/document-design-and-purpose-not.html" title="&quot;Document Design and Purpose, Not Mechanics&quot;" /><author><name>Stephen Turner</name><uri>http://www.blogger.com/profile/06656711316726116187</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="26" height="32" src="http://3.bp.blogspot.com/-aT3qBWI4VYc/TgvR9CnlS0I/AAAAAAAAMDk/KuA2GGqURcc/s220/pic2-cropped-400x500.jpg" /></author><thr:total>2</thr:total></entry><entry gd:etag="W/&quot;DEUCQHw8eip7ImA9WhBTEkU.&quot;"><id>tag:blogger.com,1999:blog-6232819486261696035.post-8205877176050413452</id><published>2013-01-28T07:09:00.000-06:00</published><updated>2013-02-07T19:17:41.272-06:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2013-02-07T19:17:41.272-06:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="RNA-Seq" /><category scheme="http://www.blogger.com/atom/ns#" term="Statistics" /><category scheme="http://www.blogger.com/atom/ns#" term="Web Apps" /><category 
scheme="http://www.blogger.com/atom/ns#" term="Sequencing" /><category scheme="http://www.blogger.com/atom/ns#" term="Bioinformatics" /><title>Scotty, We Need More Power! Power, Sample Size, and Coverage Estimation for RNA-Seq</title><content type="html">Two of the most common questions at the beginning of an RNA-seq experiment are "how many reads do I need?" and "how many replicates do I need?". &lt;a href="http://bioinformatics.oxfordjournals.org/content/early/2013/01/12/bioinformatics.btt015.abstract"&gt;This paper&lt;/a&gt; describes a &lt;a href="http://euler.bc.edu/marthlab/scotty/scotty.php"&gt;web application&lt;/a&gt; for designing RNA-seq experiments that calculates an appropriate sample size and read depth to satisfy user-defined criteria such as cost, maximum number of reads or replicates attainable, etc. The power and sample size estimations are based on a t-test, which, the authors claim, performs no worse than the negative binomial models implemented by popular RNA-seq methods such as &lt;a href="http://www.ncbi.nlm.nih.gov/pubmed/20979621"&gt;DESeq&lt;/a&gt;, when there are three or more replicates present. Empirical distributions are taken from either (1) pilot data that the user can upload, or (2) built-in publicly available data. The authors find that there is substantial heterogeneity between experiments (technical variation is larger than biological variation in many cases), and that power and sample size estimation will be more accurate when the user provides their own pilot data.&lt;br /&gt;
&lt;strike&gt;&lt;br /&gt;&lt;/strike&gt;
&lt;strike&gt;My only complaint, for all the reasons expressed in&amp;nbsp;&lt;a href="http://gettinggeneticsdone.blogspot.com/2013/01/stop-hosting-data-and-code-on-your-lab.html"&gt;my previous blog post about why you shouldn't host things like this exclusively on your lab website&lt;/a&gt;, is that the code to run this analysis doesn't appear to be available to save, study, modify, maintain, or archive. When lead author &lt;a href="https://twitter.com/michelebusby"&gt;Michele Busby&lt;/a&gt; leaves Gabor Marth's lab, hopefully the app doesn't fall into the &lt;a href="http://www.plosone.org/article/info%3Adoi%2F10.1371%2Fjournal.pone.0024914"&gt;graveyard of computational biology web apps&lt;/a&gt;.&amp;nbsp;&lt;/strike&gt;&amp;nbsp;&lt;b&gt;Update 2/7/13&lt;/b&gt;: Michele Busby created a public Github repository for the Scotty code:&amp;nbsp;&lt;a href="https://github.com/mbusby/Scotty"&gt;https://github.com/mbusby/Scotty&lt;/a&gt;&lt;br /&gt;
&lt;div class="p1"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
&lt;/div&gt;
&lt;div class="p1"&gt;
&lt;a href="http://en.wikipedia.org/wiki/Wikipedia:Too_long;_didn't_read"&gt;tl;dr&lt;/a&gt;? There's a new web app that does power, sample size, and coverage calculations for RNA-seq, but it only works well if the pilot or public data you give it closely matches the actual data you'll collect.&amp;nbsp;&lt;/div&gt;
&lt;div class="p1"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
Paper: &lt;a href="http://bioinformatics.oxfordjournals.org/content/early/2013/01/12/bioinformatics.btt015.abstract"&gt;Busby, et al. "Scotty: A Web Tool For Designing RNA-Seq Experiments to Measure Differential Gene Expression." Bioinformatics (2013):&amp;nbsp;10.1093/bioinformatics/btt015.&lt;/a&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
Web app:&amp;nbsp;&lt;a href="http://euler.bc.edu/marthlab/scotty/scotty.php"&gt;http://euler.bc.edu/marthlab/scotty/scotty.php&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
Source code:&amp;nbsp;&lt;a href="https://github.com/mbusby/Scotty"&gt;https://github.com/mbusby/Scotty&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;&lt;div class="blogger-post-footer"&gt;Getting Genetics Done by Stephen Turner is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/GettingGeneticsDone/~4/TeG_QDeN9Ao" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://gettinggeneticsdone.blogspot.com/feeds/8205877176050413452/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2013/01/scotty-power-sample-size-coverage-rna-seq.html#comment-form" title="4 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/8205877176050413452?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/8205877176050413452?v=2" /><link rel="alternate" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2013/01/scotty-power-sample-size-coverage-rna-seq.html" title="Scotty, We Need More Power! 
Power, Sample Size, and Coverage Estimation for RNA-Seq" /><author><name>Stephen Turner</name><uri>http://www.blogger.com/profile/06656711316726116187</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="26" height="32" src="http://3.bp.blogspot.com/-aT3qBWI4VYc/TgvR9CnlS0I/AAAAAAAAMDk/KuA2GGqURcc/s220/pic2-cropped-400x500.jpg" /></author><thr:total>4</thr:total></entry><entry gd:etag="W/&quot;CUAESHw8cSp7ImA9WhNbEko.&quot;"><id>tag:blogger.com,1999:blog-6232819486261696035.post-4042937297568903723</id><published>2013-01-14T21:47:00.000-06:00</published><updated>2013-01-15T12:15:09.279-06:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2013-01-15T12:15:09.279-06:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Recommended Reading" /><category scheme="http://www.blogger.com/atom/ns#" term="Conferences" /><category scheme="http://www.blogger.com/atom/ns#" term="Bioinformatics" /><title>The Pacific Symposium on Biocomputing 2013</title><content type="html">&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://3.bp.blogspot.com/-Yzaa4dSZuOM/UPTQHo1W1FI/AAAAAAAABRE/fOyVhE52hEY/s1600/DSC_0427.JPG" imageanchor="1" style="margin-left:1em; margin-right:1em"&gt;&lt;img border="0" height="182" width="400" src="http://3.bp.blogspot.com/-Yzaa4dSZuOM/UPTQHo1W1FI/AAAAAAAABRE/fOyVhE52hEY/s400/DSC_0427.JPG" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br&gt;
For 18 years now, computational biologists have convened on the beautiful islands of Hawaii to present and discuss research emerging from new areas of biomedicine.  PSB Conference Chairs Teri Klein (&lt;a href="https://twitter.com/teriklein"&gt;@teriklein&lt;/a&gt;), Keith Dunker, Russ Altman (&lt;a href="https://twitter.com/Rbaltman"&gt;@Rbaltman&lt;/a&gt;) and Larry Hunter (&lt;a href="https://twitter.com/ProfLHunter"&gt;@ProfLHunter&lt;/a&gt;) organize innovative sessions and tutorials that are always interactive and thought-provoking.  This year, sessions included Computational Drug Repositioning, Epigenomics, Aberrant Pathway and Network Activity, Personalized Medicine, Phylogenomics and Population Genomics, Post-Next Generation Sequencing, and Text and Data Mining.  The Proceedings are available &lt;a href="http://psb.stanford.edu/psb-online/proceedings/psb13/"&gt;online here&lt;/a&gt;, and a few of the highlights are:&lt;br&gt;&lt;br&gt;

&lt;a href="http://psb.stanford.edu/psb-online/proceedings/psb13/cheng.pdf"&gt;Cheng et al.&lt;/a&gt; examine various analytical methods for processing data from the Connectivity Map, a dataset of gene expression changes due to small molecule treatment.  They compare methods for identifying drug-induced gene expression profiles to a benchmark based on the Anatomical Therapeutic Chemical (ATC) system with the hope of discovering additional mechanisms of action.&lt;br&gt;&lt;br&gt;

&lt;a href="http://psb.stanford.edu/psb-online/proceedings/psb13/huang.pdf"&gt;Huang et al.&lt;/a&gt; developed a recursive K-means spectral clustering algorithm and applied this method to gene expression data from the Cancer Genome Atlas.  It provides better cluster separation than traditional hierarchical clustering, and better execution time than similar K-means approaches.&lt;br&gt;&lt;br&gt; 
 
&lt;a href="http://psb.stanford.edu/psb-online/proceedings/psb13/schrider.pdf"&gt;Schrider et al.&lt;/a&gt; used pooled paired-end sequence data from multiple Drosophila melanogaster populations along the eastern US coast to identify copy number variants under selective pressure.  Many of the CNVs identified contain CYP enzymes likely influencing insecticide resistance.  Schrider also pointed out in his talk that human salivary amylase (AMY1) has copy numbers that are differentiated across human populations due to differences in dietary starch content.  Cool!&lt;br&gt;&lt;br&gt;

&lt;a href="http://psb.stanford.edu/psb-online/proceedings/psb13/verspoor.pdf"&gt;Verspoor et al.&lt;/a&gt; presented an awesome application of text mining to identify catalytic protein residues from the biomedical literature.  Text mining tasks are always fraught with difficulties such as identifier ambiguity and resolution, or simply identifying the corpus of text needed for the task.  Using Literature-Enhanced Automated Prediction of Functional Sites (LEAP-FS) and the Protein Data Bank (with Pubmed references), they compare their text mining approach to the Catalytic Site Atlas as a ‘silver standard’.  Despite the difficulty, a simple classifier gives an accuracy around 70% (measured by F-statistic).&lt;br&gt;&lt;br&gt;
   
Also, my colleague Ting Hu presented &lt;a href="http://psb.stanford.edu/psb-online/proceedings/psb13/hu.pdf"&gt;her excellent work&lt;/a&gt; on statistical epistasis networks which use entropy-based measures to identify high-order interactions in genetic data.  And in case you are interested, I’ll end by shamelessly listing our own publications in &lt;a href="http://psb.stanford.edu/psb-online/proceedings/psb13/holzinger.pdf"&gt;complex data analysis&lt;/a&gt; and &lt;a href="http://psb.stanford.edu/psb-online/proceedings/psb13/moore.pdf"&gt;rare-variant population structure&lt;/a&gt; (with Marylyn Ritchie), and &lt;a href="http://psb.stanford.edu/psb-online/proceedings/psb13/crawford.pdf"&gt;performance of the Illumina Metabochip in Hispanic samples&lt;/a&gt; and &lt;a href="http://psb.stanford.edu/psb-online/proceedings/psb13/bush.pdf"&gt;high-throughput epidemiology&lt;/a&gt; (with Dana Crawford).&lt;br&gt;&lt;br&gt;

PSB is always a fantastic meeting – hope to see you in 2014!&lt;div class="blogger-post-footer"&gt;Getting Genetics Done by Stephen Turner is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/GettingGeneticsDone/~4/Ih2CrzvEx3Y" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://gettinggeneticsdone.blogspot.com/feeds/4042937297568903723/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2013/01/the-pacific-symposium-on-biocomputing.html#comment-form" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/4042937297568903723?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/4042937297568903723?v=2" /><link rel="alternate" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2013/01/the-pacific-symposium-on-biocomputing.html" title="The Pacific Symposium on Biocomputing 2013" /><author><name>Will</name><uri>http://www.blogger.com/profile/09703349044940180835</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="http://1.bp.blogspot.com/-DPV6nrTtGHY/UPTSZniomRI/AAAAAAAABRc/_nDd2s1gwko/s220/19bf088.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://3.bp.blogspot.com/-Yzaa4dSZuOM/UPTQHo1W1FI/AAAAAAAABRE/fOyVhE52hEY/s72-c/DSC_0427.JPG" height="72" width="72" /><thr:total>0</thr:total></entry><entry gd:etag="W/&quot;DkYNQXo7fSp7ImA9WhNUFks.&quot;"><id>tag:blogger.com,1999:blog-6232819486261696035.post-7643768768878261319</id><published>2013-01-08T10:56:00.000-06:00</published><updated>2013-01-08T10:56:30.405-06:00</updated><app:edited 
xmlns:app="http://www.w3.org/2007/app">2013-01-08T10:56:30.405-06:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Writing" /><category scheme="http://www.blogger.com/atom/ns#" term="Databases" /><category scheme="http://www.blogger.com/atom/ns#" term="Web Apps" /><category scheme="http://www.blogger.com/atom/ns#" term="Software" /><title>Stop Hosting Data and Code on your Lab Website</title><content type="html">It's happened to all of us. You read about a new tool, database, webservice, software, or some interesting and useful data, but when you browse to &lt;span style="font-family: Courier New, Courier, monospace;"&gt;http://institution.edu/~home/professorX/lab/data&lt;/span&gt;, there's no trace of what you were looking for.&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;THE PROBLEM&lt;/b&gt;&lt;br /&gt;
&lt;br /&gt;
This isn't an uncommon problem. See the following two articles:&lt;br /&gt;
&lt;blockquote class="tr_bq"&gt;
Schultheiss, Sebastian J., et al. &lt;a href="http://www.plosone.org/article/info%3Adoi%2F10.1371%2Fjournal.pone.0024914"&gt;"Persistence and availability of web services in computational biology."&lt;/a&gt; PLoS one 6.9 (2011): e24914.&amp;nbsp;&lt;/blockquote&gt;
&lt;blockquote class="tr_bq"&gt;
Wren, Jonathan D. &lt;a href="http://bioinformatics.oxfordjournals.org/content/20/5/668.long"&gt;"404 not found: the stability and persistence of URLs published in MEDLINE."&lt;/a&gt; Bioinformatics 20.5 (2004): 668-672.&lt;/blockquote&gt;
The first gives us some alarming statistics. In a survey of nearly 1000 web services published in the &lt;i&gt;Nucleic Acids Research Web Server Issue &lt;/i&gt;between 2003 and 2009:&lt;br /&gt;
&lt;ul&gt;
&lt;li&gt;Only 72% were still available at the published address.&lt;/li&gt;
&lt;li&gt;The authors could not test the functionality for 33% because there was no example data, and&amp;nbsp;13% no longer worked as expected.&lt;/li&gt;
&lt;li&gt;The authors could only confirm positive functionality for 45%.&lt;/li&gt;
&lt;li&gt;Only 274 of the 872 corresponding authors answered an email.&lt;/li&gt;
&lt;li&gt;Of these, 78% said a service was developed by a student or temporary researcher, and many had no plan for maintenance after the researcher had moved on to a permanent position.&lt;/li&gt;
&lt;/ul&gt;
&lt;div&gt;
The Wren paper found that of 1630 URLs identified in PubMed abstracts, only 63% were consistently available. That rate was far worse for anonymous login FTP sites (33%).&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
&lt;a href="http://blog.openhelix.eu/?p=12206"&gt;OpenHelix&lt;/a&gt;&amp;nbsp;recently started &lt;a href="http://www.biostars.org/p/44043/"&gt;this thread on Biostar&lt;/a&gt; as an obituary section for bioinformatics tools and resources that have vanished.&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
It's a fact that most of us academics move around a fair amount. Often we may not deem a tool we developed or data we collected and released to be worth transporting and maintaining. After some grace period, the resource disappears without a trace.&amp;nbsp;&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
&lt;b&gt;SOFTWARE&lt;/b&gt;&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
I won't spend much time on this because most readers are probably aware of source code repositories for hosting software projects. Unless you're not releasing the source code to your software (aside: starting an open-source project is a &lt;a href="http://sciencecodemanifesto.org/"&gt;way to stake a claim in a field&lt;/a&gt;, not a real risk for getting yourself scooped), I can think of no benefit to hosting your code on your lab website when there are plenty of better alternatives available, such as &lt;a href="http://sourceforge.net/"&gt;Sourceforge&lt;/a&gt;, &lt;a href="https://github.com/"&gt;GitHub&lt;/a&gt;, &lt;a href="http://code.google.com/"&gt;Google Code&lt;/a&gt;, and others. In addition to free project hosting, tools like these provide version control, wikis, bug trackers, mailing lists and other services to enable transparent and open development with the end result of a better product and higher visibility. For more tips on open scientific software development, see this short editorial in PLoS Comp Bio:&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
Prlić A, Procter JB (2012) &lt;a href="http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1002802"&gt;Ten Simple Rules for the Open Development of Scientific Software. &lt;/a&gt;PLoS Comput Biol 8(12): e1002802.&amp;nbsp;&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
Casey Bergman recently &lt;a href="http://caseybergman.wordpress.com/2012/07/15/where-do-bioinformaticians-host-their-code/"&gt;analyzed where bioinformaticians are hosting their code&lt;/a&gt; and found that the growth rate of GitHub is outpacing both Google Code and Sourceforge. Indeed, &lt;a href="https://github.com/blog/1359-the-octoverse-in-2012"&gt;GitHub hosts more repositories&lt;/a&gt; than there are &lt;a href="http://en.wikipedia.org/wiki/User:R._fiend/How_many_articles_does_Wikipedia_really_have%3F"&gt;articles in Wikipedia&lt;/a&gt;, and has an excellent &lt;a href="http://learn.github.com/p/intro.html"&gt;tutorial&lt;/a&gt; and &lt;a href="http://try.github.com/levels/1/challenges/1"&gt;interactive learning modules&lt;/a&gt; to help you learn how to use it. However, Bergman also points out &lt;a href="https://caseybergman.wordpress.com/2012/11/08/on-the-preservation-of-published-bioinformatics-code-on-github/"&gt;how easy it is to delete a repository from GitHub&lt;/a&gt; and Google Code, where repositories are published by &lt;i&gt;individuals&lt;/i&gt;&amp;nbsp;who hold the keys to preservation (as opposed to Sourceforge, where it is &lt;a href="http://sourceforge.net/apps/trac/sourceforge/wiki/Removing%20a%20project"&gt;extremely difficult to remove a project once it's been released&lt;/a&gt;).&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
&lt;b&gt;DATA, FIGURES, SLIDES, WEB SERVICES, OR ANYTHING ELSE&lt;/b&gt;&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
&lt;div&gt;
For everything else&amp;nbsp;there's &lt;a href="http://figshare.com/"&gt;Figshare&lt;/a&gt;. Figshare lets you host and publicly share &lt;i&gt;unlimited&lt;/i&gt;&amp;nbsp;data (or store data privately up to 1GB). The name suggests a site for sharing figures, but Figshare allows you to permanently store and share &lt;i&gt;any research object&lt;/i&gt;. That can be figures, slides, negative results, videos, datasets, or anything else. If you're running a database server or web service, you can package up the source code on one of the repositories mentioned above, and upload to Figshare a &lt;a href="https://www.virtualbox.org/"&gt;virtual machine image&lt;/a&gt; of the server running it, so that the service will be available to users long after you've lost the time, interest, or money to maintain it.&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
Research outputs stored at Figshare are archived in the &lt;a href="http://www.clockss.org/"&gt;CLOCKSS&lt;/a&gt; geographically and geopolitically distributed network of redundant archive nodes, located at 12 major research libraries around the world. This means that content will remain available indefinitely for everyone after a "trigger event," and ensures this work will be maximally accessible and useful over time. Figshare is hosted using Amazon Web Services to ensure the highest level of security and stability for research data.&amp;nbsp;&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
Once you upload your data to Figshare, it becomes discoverable, searchable, shareable, and instantly citable with its own DOI, allowing you to immediately take credit for the products of your research.&amp;nbsp;&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
To show you how easy this is, I recently uploaded a list of "consensus" genes generated by Will Bush&amp;nbsp;where Ensembl refers to an Entrez-gene with the same coordinates, and that Entrez-gene entry refers back to the same Ensembl gene (&lt;a href="http://gettinggeneticsdone.blogspot.com/2011/06/mapping-snps-to-genes-for-gwas.html"&gt;discussed in more detail in this previous post&lt;/a&gt;).&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
Create an account, and hit the big &lt;a href="http://figshare.com/account/upload"&gt;upload link&lt;/a&gt;. You'll be given a screen to drag and drop anything you'd like here (there's also a desktop uploader for larger files).&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;/div&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://1.bp.blogspot.com/-ZmG2hw3xcbI/UNJAoaqf4PI/AAAAAAABCWg/Rl7DpoKZzdA/s1600/figshare1.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="339" src="http://1.bp.blogspot.com/-ZmG2hw3xcbI/UNJAoaqf4PI/AAAAAAABCWg/Rl7DpoKZzdA/s640/figshare1.png" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
Once I drop in the data I downloaded from Vanderbilt's website (linked from the original blog post), I enter some optional metadata, a description, and a link back to the original post:&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://3.bp.blogspot.com/-0FAxuk4FBKA/UNJApxaJiWI/AAAAAAABCWo/NcoVAHP9tIU/s1600/figshare2.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="640" src="http://3.bp.blogspot.com/-0FAxuk4FBKA/UNJApxaJiWI/AAAAAAABCWo/NcoVAHP9tIU/s640/figshare2.png" width="578" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
I then instantly receive a citeable DOI where the data is stored permanently, regardless of Will's future at Vanderbilt:&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
Ensembl/Entrez hg19/GRCh37 Consensus Genes. Stephen Turner. figshare.&amp;nbsp;Retrieved 21:31, Dec 19, 2012 (GMT).&amp;nbsp;&lt;a href="http://dx.doi.org/10.6084/m9.figshare.103113"&gt;http://dx.doi.org/10.6084/m9.figshare.103113&lt;/a&gt;&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
There are also links to the side that allow you to export that citation directly to your reference manager of choice.&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
Finally, as an experiment, I also uploaded this entire blog post to Figshare, where it is now citeable and permanently archived:&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
&lt;div&gt;
Stop Hosting Data and Code on your Lab Website. Stephen Turner. figshare.&amp;nbsp;Retrieved 22:51, Dec 19, 2012 (GMT).&amp;nbsp;&lt;a href="http://dx.doi.org/10.6084/m9.figshare.105125"&gt;http://dx.doi.org/10.6084/m9.figshare.105125&lt;/a&gt;.&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="blogger-post-footer"&gt;Getting Genetics Done by Stephen Turner is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/GettingGeneticsDone/~4/pJ5IZ9SI8z4" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://gettinggeneticsdone.blogspot.com/feeds/7643768768878261319/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2013/01/stop-hosting-data-and-code-on-your-lab.html#comment-form" title="8 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/7643768768878261319?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/7643768768878261319?v=2" /><link rel="alternate" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2013/01/stop-hosting-data-and-code-on-your-lab.html" title="Stop Hosting Data and Code on your Lab Website" /><author><name>Stephen Turner</name><uri>http://www.blogger.com/profile/06656711316726116187</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="26" height="32" src="http://3.bp.blogspot.com/-aT3qBWI4VYc/TgvR9CnlS0I/AAAAAAAAMDk/KuA2GGqURcc/s220/pic2-cropped-400x500.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://1.bp.blogspot.com/-ZmG2hw3xcbI/UNJAoaqf4PI/AAAAAAABCWg/Rl7DpoKZzdA/s72-c/figshare1.png" height="72" width="72" /><thr:total>8</thr:total></entry><entry gd:etag="W/&quot;C0cFQXc9eCp7ImA9WhNUE0w.&quot;"><id>tag:blogger.com,1999:blog-6232819486261696035.post-207466413550064371</id><published>2013-01-04T08:50:00.000-06:00</published><updated>2013-01-04T08:50:10.960-06:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2013-01-04T08:50:10.960-06:00</app:edited><category 
scheme="http://www.blogger.com/atom/ns#" term="Metagenomics" /><category scheme="http://www.blogger.com/atom/ns#" term="Software" /><category scheme="http://www.blogger.com/atom/ns#" term="Sequencing" /><category scheme="http://www.blogger.com/atom/ns#" term="R" /><category scheme="http://www.blogger.com/atom/ns#" term="Bioinformatics" /><title>Twitter Roundup, January 4 2013</title><content type="html">I've said it before: &lt;a href="https://twitter.com/genetics_blog"&gt;Twitter&lt;/a&gt; makes me a lazy blogger. Lots of stuff came across my radar this week that didn't make it into a full blog post. Here's a quick recap:&lt;br /&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
&lt;a href="http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1002826?utm_source=feedburner&amp;amp;utm_medium=feed&amp;amp;utm_campaign=Feed%3A+ploscompbiol%2FNewArticles+%28Ambra+-+Computational+Biology+New+Articles%29"&gt;PLOS Computational Biology: Chapter 1: Biomedical Knowledge Integration&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://www.nature.com/nbt/journal/v30/n11/full/nbt.2403.html"&gt;Assuring the quality of next-generation sequencing in clinical laboratory practice : Nature Biotechnology&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://www.nature.com/nmeth/journal/v9/n4/full/nmeth.1935.html"&gt;De novo genome assembly: what every biologist should know : Nature Methods&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://www.ncbi.nlm.nih.gov/pubmed/23270466"&gt;How deep is deep enough for RNA-Seq profiling of bacterial transcriptomes?&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://www.silencejournal.com/content/3/1/9/abstract"&gt;Silence | Abstract | Strand-specific libraries for high throughput RNA sequencing (RNA-Seq) prepared without poly(A) selection&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://www.biomedcentral.com/1471-2164/13/730/abstract"&gt;BMC Genomics | Abstract | Comparison of metagenomic samples using sequence signatures&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://www.ncbi.nlm.nih.gov/pubmed/23266983"&gt;Peak identification for ChIP-seq data with no controls.&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://nar.oxfordjournals.org/content/early/2012/12/18/nar.gks1311.long"&gt;TrueSight: a new algorithm for splice junction detection using RNA-seq&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://www.ncbi.nlm.nih.gov/pubmed/23246976"&gt;DiffCorr: An R package to analyze and visualize differential correlations in biological networks.&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://www.plosone.org/article/info%3Adoi%2F10.1371%2Fjournal.pone.0052210"&gt;PLOS ONE: Reevaluating Assembly Evaluations with Feature Response Curves: GAGE and Assemblathons&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://www.guardian.co.uk/global-development-professionals-network/2013/jan/03/public-health-genomics"&gt;Delivering the promise of public health genomics | Global Development Professionals Network&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://www.sciencedirect.com/science/article/pii/S0196439912000566"&gt;Metagenomics and Community Profiling: Culture-Independent Techniques in the Clinical Laboratory&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://www.plosone.org/article/info%3Adoi%2F10.1371%2Fjournal.pone.0052881"&gt;PLOS ONE: A Model-Based Clustering Method for Genomic Structural Variant Prediction and Genotyping Using Paired-End Sequencing Data&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="https://www.innocentive.com/ar/challenge/9933138"&gt;InnoCentive - Metagenomics Challenge&lt;/a&gt;&lt;/div&gt;
&lt;div class="blogger-post-footer"&gt;Getting Genetics Done by Stephen Turner is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/GettingGeneticsDone/~4/fTPJOIGdqKM" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://gettinggeneticsdone.blogspot.com/feeds/207466413550064371/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2013/01/twitter-roundup-january-4-2013.html#comment-form" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/207466413550064371?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/207466413550064371?v=2" /><link rel="alternate" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2013/01/twitter-roundup-january-4-2013.html" title="Twitter Roundup, January 4 2013" /><author><name>Stephen Turner</name><uri>http://www.blogger.com/profile/06656711316726116187</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="26" height="32" src="http://3.bp.blogspot.com/-aT3qBWI4VYc/TgvR9CnlS0I/AAAAAAAAMDk/KuA2GGqURcc/s220/pic2-cropped-400x500.jpg" /></author><thr:total>0</thr:total></entry><entry gd:etag="W/&quot;CEADQHs9eSp7ImA9WhNUEU4.&quot;"><id>tag:blogger.com,1999:blog-6232819486261696035.post-8470016976038228248</id><published>2013-01-02T07:18:00.002-06:00</published><updated>2013-01-02T07:19:31.561-06:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2013-01-02T07:19:31.561-06:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Writing" /><category scheme="http://www.blogger.com/atom/ns#" term="Tutorials" /><category scheme="http://www.blogger.com/atom/ns#" term="Software" /><category 
scheme="http://www.blogger.com/atom/ns#" term="Announcements" /><category scheme="http://www.blogger.com/atom/ns#" term="R" /><title>Computing for Data Analysis, and Other Free Courses</title><content type="html">Coursera's free &lt;a href="https://www.coursera.org/course/compdata"&gt;Computing for Data Analysis&lt;/a&gt; course starts today.&amp;nbsp;It's a four-week course, requiring about 3-5 hours/week. A bit about the course:&lt;br /&gt;
&lt;blockquote class="tr_bq"&gt;
&lt;i&gt;In this course you will learn how to program in R and how to use R for effective data analysis. You will learn how to install and configure software necessary for a statistical programming environment, discuss generic programming language concepts as they are implemented in a high-level statistical language. The course covers practical issues in statistical computing which includes programming in R, reading data into R, creating informative data graphics, accessing R packages, creating R packages with documentation, writing R functions, debugging, and organizing and commenting R code. Topics in statistical data analysis and optimization will provide working examples.&lt;/i&gt;&lt;/blockquote&gt;
There are also &lt;a href="https://www.coursera.org/courses"&gt;hundreds of other free courses&lt;/a&gt; scheduled for this year. While the Computing for Data Analysis course is more about using R, the &lt;a href="https://www.coursera.org/course/dataanalysis"&gt;Data Analysis&lt;/a&gt; course is more about the methods and experimental designs you'll use, with a smaller emphasis on the R language. There are also courses on&amp;nbsp;&lt;a href="https://www.coursera.org/course/scientificcomp"&gt;Scientific Computing&lt;/a&gt;,&amp;nbsp;&lt;a href="https://www.coursera.org/course/algo"&gt;Algorithms&lt;/a&gt;,&amp;nbsp;&lt;a href="https://www.coursera.org/course/healthinformatics"&gt;Health Informatics in the Cloud&lt;/a&gt;,&amp;nbsp;&lt;a href="https://www.coursera.org/course/nlangp"&gt;Natural Language Processing&lt;/a&gt;,&amp;nbsp;&lt;a href="https://www.coursera.org/course/datasci"&gt;Introduction to Data Science&lt;/a&gt;,&amp;nbsp;&lt;a href="https://www.coursera.org/course/sciwrite"&gt;Scientific Writing&lt;/a&gt;,&amp;nbsp;&lt;a href="https://www.coursera.org/course/neuralnets"&gt;Neural Networks&lt;/a&gt;,&amp;nbsp;&lt;a href="https://www.coursera.org/course/hetero"&gt;Parallel Programming&lt;/a&gt;,&amp;nbsp;&lt;a href="https://www.coursera.org/course/stats1"&gt;Statistics 101&lt;/a&gt;,&amp;nbsp;&lt;a href="https://www.coursera.org/course/netsysbio"&gt;Systems Biology&lt;/a&gt;,&amp;nbsp;&lt;a href="https://www.coursera.org/course/datamanagement"&gt;Data Management for Clinical Research&lt;/a&gt;, and many, many others. See the link below for the full listing.&lt;br /&gt;
&lt;br /&gt;
&lt;a href="https://www.coursera.org/courses"&gt;Free Courses on Coursera&lt;/a&gt;&lt;div class="blogger-post-footer"&gt;Getting Genetics Done by Stephen Turner is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/GettingGeneticsDone/~4/2kknGmYOHY4" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://gettinggeneticsdone.blogspot.com/feeds/8470016976038228248/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2013/01/computing-for-data-analysis-and-other.html#comment-form" title="1 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/8470016976038228248?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/8470016976038228248?v=2" /><link rel="alternate" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2013/01/computing-for-data-analysis-and-other.html" title="Computing for Data Analysis, and Other Free Courses" /><author><name>Stephen Turner</name><uri>http://www.blogger.com/profile/06656711316726116187</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="26" height="32" src="http://3.bp.blogspot.com/-aT3qBWI4VYc/TgvR9CnlS0I/AAAAAAAAMDk/KuA2GGqURcc/s220/pic2-cropped-400x500.jpg" /></author><thr:total>1</thr:total></entry><entry gd:etag="W/&quot;CUQGQ306cCp7ImA9WhNVGUo.&quot;"><id>tag:blogger.com,1999:blog-6232819486261696035.post-6830600210143374334</id><published>2012-12-31T11:02:00.000-06:00</published><updated>2012-12-31T11:02:02.318-06:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-12-31T11:02:02.318-06:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Recommended Reading" /><category 
scheme="http://www.blogger.com/atom/ns#" term="Sequencing" /><category scheme="http://www.blogger.com/atom/ns#" term="Visualization" /><category scheme="http://www.blogger.com/atom/ns#" term="Annotation" /><category scheme="http://www.blogger.com/atom/ns#" term="GWAS" /><category scheme="http://www.blogger.com/atom/ns#" term="R" /><category scheme="http://www.blogger.com/atom/ns#" term="Bioinformatics" /><title>Getting Genetics Done 2012 In Review</title><content type="html">Here are links to all of this year's posts (excluding seminar/webinar announcements), with the most visited posts in &lt;b&gt;&lt;i&gt;bold italic&lt;/i&gt;&lt;/b&gt;. As always, you can &lt;a href="https://twitter.com/genetics_blog"&gt;follow me on Twitter&lt;/a&gt; for more frequent updates. Happy new year!&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://gettinggeneticsdone.blogspot.com/2012/01/new-years-resolution-learn-how-to-code.html"&gt;New Year's Resolution: Learn How to Code&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://gettinggeneticsdone.blogspot.com/2012/01/annotating-limma-results-with-gene.html"&gt;Annotating limma Results with Gene Names for Affy Microarrays&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://gettinggeneticsdone.blogspot.com/2012/02/your-publications-with-pmcid-as-pubmed.html"&gt;Your Publications (with PMCID) as a PubMed Query&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://gettinggeneticsdone.blogspot.com/2012/03/pathway-analysis-for-high-throughput.html"&gt;&lt;b&gt;&lt;i&gt;Pathway Analysis for High-Throughput Genomics Studies&lt;/i&gt;&lt;/b&gt;&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://gettinggeneticsdone.blogspot.com/2012/03/find-xargs-like-boss.html"&gt;find | xargs ... Like a Boss&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://gettinggeneticsdone.blogspot.com/2012/03/redesign-by-subtraction.html"&gt;Redesign by Subtraction&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://gettinggeneticsdone.blogspot.com/2012/03/video-tip-convert-gene-ids-with-biomart.html"&gt;Video Tip: Convert Gene IDs with Biomart&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://gettinggeneticsdone.blogspot.com/2012/04/rna-seq-methods-march-twitter-roundup.html"&gt;RNA-Seq Methods &amp;amp; March Twitter Roundup&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://gettinggeneticsdone.blogspot.com/2012/04/awk-command-to-count-total-unique-and.html"&gt;Awk Command to Count Total, Unique, and the Most Abundant Read in a FASTQ file&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://gettinggeneticsdone.blogspot.com/2012/05/video-tip-use-ensembl-biomart-to.html"&gt;Video Tip: Use Ensembl BioMart to Quickly Get Ortholog Information&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://gettinggeneticsdone.blogspot.com/2012/05/stepping-outside-my-open-source-comfort.html"&gt;Stepping Outside My Open-Source Comfort Zone: A First Look at Golden Helix SVS&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://gettinggeneticsdone.blogspot.com/2012/05/how-to-stay-current-in.html"&gt;&lt;b&gt;&lt;i&gt;How to Stay Current in Bioinformatics/Genomics&lt;/i&gt;&lt;/b&gt;&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://gettinggeneticsdone.blogspot.com/2012/06/haploreg-database-for-functional.html"&gt;The HaploREG Database for Functional Annotation of SNPs&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://gettinggeneticsdone.blogspot.com/2012/06/identifying-pathogens-in-sequencing.html"&gt;Identifying Pathogens in Sequencing Data&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://gettinggeneticsdone.blogspot.com/2012/06/browsing-dbgap-results.html"&gt;Browsing dbGAP Results&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://gettinggeneticsdone.blogspot.com/2012/07/fix-overplotting-with-colored-contour.html"&gt;&lt;b&gt;&lt;i&gt;Fix Overplotting with Colored Contour Lines&lt;/i&gt;&lt;/b&gt;&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://gettinggeneticsdone.blogspot.com/2012/07/plotting-frequency-of-twitter-hashtag.html"&gt;Plotting the Frequency of Twitter Hashtag Usage Over Time with R and ggplot2&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://gettinggeneticsdone.blogspot.com/2012/08/cscan-finding-gene-expression.html"&gt;Cscan: Finding Gene Expression Regulators with ENCODE ChIP-Seq Data&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://gettinggeneticsdone.blogspot.com/2012/08/more-on-exploring-correlations-in-r.html"&gt;More on Exploring Correlations in R&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://gettinggeneticsdone.blogspot.com/2012/09/deseq-vs-edger-comparison.html"&gt;&lt;b&gt;&lt;i&gt;DESeq vs edgeR Comparison&lt;/i&gt;&lt;/b&gt;&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://gettinggeneticsdone.blogspot.com/2012/09/learn-r-and-python-and-have-fun-doing-it.html"&gt;Learn R and Python, and Have Fun Doing It&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://gettinggeneticsdone.blogspot.com/2012/11/star-ultrafast-universal-rna-seq-aligner.html"&gt;STAR: ultrafast universal RNA-seq aligner&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://gettinggeneticsdone.blogspot.com/2012/11/regulomedb-identify-dna-features-and.html"&gt;RegulomeDB: Identify DNA Features and Regulatory Elements in Non-Coding Regions&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://gettinggeneticsdone.blogspot.com/2012/11/copy-text-to-local-clipboard-from.html"&gt;Copy Text to the Local Clipboard from a Remote SSH Session&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://gettinggeneticsdone.blogspot.com/2012/12/differential-isoform-expression-cuffdiff2.html"&gt;&lt;b&gt;&lt;i&gt;Differential Isoform Expression With RNA-Seq: Are We Really There Yet?&lt;/i&gt;&lt;/b&gt;&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;&lt;div class="blogger-post-footer"&gt;Getting Genetics Done by Stephen Turner is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/GettingGeneticsDone/~4/muzWETFGdpg" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://gettinggeneticsdone.blogspot.com/feeds/6830600210143374334/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2012/12/getting-genetics-done-2012-in-review.html#comment-form" title="0 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/6830600210143374334?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/6830600210143374334?v=2" /><link rel="alternate" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2012/12/getting-genetics-done-2012-in-review.html" title="Getting Genetics Done 2012 In Review" /><author><name>Stephen Turner</name><uri>http://www.blogger.com/profile/06656711316726116187</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="26" height="32" src="http://3.bp.blogspot.com/-aT3qBWI4VYc/TgvR9CnlS0I/AAAAAAAAMDk/KuA2GGqURcc/s220/pic2-cropped-400x500.jpg" /></author><thr:total>0</thr:total></entry><entry gd:etag="W/&quot;A0cBRXk6cSp7ImA9WhNbEUU.&quot;"><id>tag:blogger.com,1999:blog-6232819486261696035.post-2023931904980280892</id><published>2012-12-17T15:23:00.000-06:00</published><updated>2013-01-14T12:44:14.719-06:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2013-01-14T12:44:14.719-06:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="RNA-Seq" /><category scheme="http://www.blogger.com/atom/ns#" term="Recommended Reading" /><category scheme="http://www.blogger.com/atom/ns#" 
term="Sequencing" /><category scheme="http://www.blogger.com/atom/ns#" term="R" /><category scheme="http://www.blogger.com/atom/ns#" term="Bioinformatics" /><title>Differential Isoform Expression With RNA-Seq: Are We Really There Yet?</title><content type="html">In case you missed it, a new paper was published in Nature Biotechnology on a method for detecting isoform-level differential expression with RNA-seq data:&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://www.nature.com/nbt/journal/vaop/ncurrent/full/nbt.2450.html"&gt;Trapnell, Cole, et al. "Differential analysis of gene regulation at transcript resolution with RNA-seq." Nature Biotechnology (2012).&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;b&gt;THE PROBLEM&lt;/b&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;div class="p1"&gt;
RNA-seq enables transcript-level resolution of gene expression, but there is no proven methodology for simultaneously accounting for biological variability across replicates and uncertainty in mapping fragments to isoforms. One of the most commonly used workflows is to map reads with a tool like Tophat or &lt;a href="http://gettinggeneticsdone.blogspot.com/2012/11/star-ultrafast-universal-rna-seq-aligner.html"&gt;STAR&lt;/a&gt;, use a tool like &lt;a href="http://www-huber.embl.de/users/anders/HTSeq/doc/count.html"&gt;HTSeq&lt;/a&gt; to count the number of reads overlapping a gene, then use a negative-binomial count-based approach such as &lt;a href="http://gettinggeneticsdone.blogspot.com/2012/09/deseq-vs-edger-comparison.html"&gt;edgeR or DESeq&lt;/a&gt; to assess differential expression at the gene level. &amp;nbsp;&lt;/div&gt;
&lt;div class="p1"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
&lt;a href="http://www.nature.com/nbt/journal/vaop/ncurrent/fig_tab/nbt.2450_F1.html"&gt;Figure 1 in the paper&lt;/a&gt; illustrates the problem with existing approaches, which only count the number of fragments originating from either the entire gene or constitutive exons only.&amp;nbsp;&lt;/div&gt;
&lt;div class="p1"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;table align="center" cellpadding="0" cellspacing="0" class="tr-caption-container" style="margin-left: auto; margin-right: auto; text-align: center;"&gt;&lt;tbody&gt;
&lt;tr&gt;&lt;td style="text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/-UN2mxaEvKV0/UM9-SZqwkHI/AAAAAAABCV8/8HGLF9_oZKE/s1600/cuffdiff1.png" imageanchor="1" style="margin-left: auto; margin-right: auto;"&gt;&lt;img border="0" src="http://3.bp.blogspot.com/-UN2mxaEvKV0/UM9-SZqwkHI/AAAAAAABCV8/8HGLF9_oZKE/s1600/cuffdiff1.png" /&gt;&lt;/a&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class="tr-caption" style="text-align: center;"&gt;Excerpt from &lt;a href="http://www.nature.com/nbt/journal/vaop/ncurrent/fig_tab/nbt.2450_F1.html"&gt;figure 1&lt;/a&gt; from the &lt;a href="http://www.nature.com/nbt/journal/vaop/ncurrent/full/nbt.2450.html"&gt;Cuffdiff 2 paper&lt;/a&gt;.&lt;/td&gt;&lt;/tr&gt;
&lt;/tbody&gt;&lt;/table&gt;
&lt;div class="p1"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
In the top row, a change in gene expression is undetectable by counting reads mapping to any exon, and is underestimated if counting only constitutive exons. In the middle row, an apparent change would be detected, but in the &lt;i&gt;wrong direction&lt;/i&gt;&amp;nbsp;if using a count-based method alone rather than accounting for which transcript a read comes from and how long that transcript is. How often situations like the middle row happen in reality is anyone's guess.&lt;/div&gt;
&lt;div class="p1"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
&lt;b&gt;THE PROPOSED SOLUTION&lt;/b&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
&lt;b&gt;&lt;br /&gt;&lt;/b&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
The method presented in this paper, popularized by the cuffdiff method in the &lt;a href="http://cufflinks.cbcb.umd.edu/"&gt;Cufflinks software package&lt;/a&gt;, claims to address both of these problems simultaneously by modeling variability in the number of fragments generated by each transcript across biological replicates using a beta negative binomial mixture distribution that accounts for both sources of variability in a transcript's measured expression level. This so-called transcript deconvolution is &lt;a href="http://www.nature.com/nbt/journal/vaop/ncurrent/extref/nbt.2450-S1.pdf"&gt;not computationally trivial&lt;/a&gt;, and &lt;a href="http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3146043/figure/F1/"&gt;incredibly difficult to explain&lt;/a&gt;, but failure to account for the uncertainty (measurement error) in which transcript a fragment originates from can result in a high false-positive rate, especially when there is significant differential regulation of isoforms.&amp;nbsp;Compared to existing methods, the procedure described claims equivalent sensitivity with a much lower false-positive rate when there is substantial isoform-level variability in gene expression between conditions.&amp;nbsp;&lt;/div&gt;
&lt;div class="p1"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
&lt;b&gt;ALTERNATIVE WORKFLOWS&lt;/b&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
&lt;b&gt;&lt;br /&gt;&lt;/b&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
Importantly, the manuscript also addresses and points out weaknesses in several undocumented "alternative" workflows that are discussed often on forums like &lt;a href="http://seqanswers.com/"&gt;SEQanswers&lt;/a&gt;&amp;nbsp;and anecdotally at meetings. These alternative workflows are variations on a theme: combining transcript-level fragment count estimates (like estimates from Cufflinks, &lt;a href="http://bio.math.berkeley.edu/eXpress/overview.html"&gt;eXpress&lt;/a&gt;, or&amp;nbsp;&lt;a href="http://deweylab.biostat.wisc.edu/rsem/"&gt;RSEM&lt;/a&gt;&amp;nbsp;mapping to a transcriptome), with downstream count-based analysis tools like &lt;a href="http://gettinggeneticsdone.blogspot.com/2012/09/deseq-vs-edger-comparison.html"&gt;edgeR/DESeq&lt;/a&gt;&amp;nbsp;(both R/Bioconductor packages). This paper points out that none of these tools were meant to be used this way, and doing so violates assumptions of the underlying statistics used by both procedures. However, the authors concede that the variance modeling strategies of edgeR and DESeq are robust, and thus assessed the performance of these "alternative" workflows. The results of those experiments show that the algorithm presented in this paper, cuffdiff 2, outperforms other alternative hybrid Cufflinks/RSEM + edgeR/DESeq workflows [see &lt;a href="http://www.nature.com/nbt/journal/vaop/ncurrent/extref/nbt.2450-S1.pdf"&gt;supplementary figure 77&lt;/a&gt; (&lt;i&gt;yes, 77&lt;/i&gt;!)].&lt;/div&gt;
&lt;div class="p1"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
&lt;b&gt;REPRODUCIBILITY ISSUES&lt;/b&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
In theory (and in the simulation studies presented here, see further comments below), the methodology presented here seems to outperform any other competing workflow. So why isn't everyone using it, and why is there so much &lt;a href="http://seqanswers.com/forums/showthread.php?t=20702"&gt;grumbling&lt;/a&gt; about it on forums and at meetings? For many (myself included), the biggest issue is one of reproducibility. There are many discussions about cufflinks/cuffdiff providing drastically different results from one version to the next (see &lt;a href="http://seqanswers.com/forums/showthread.php?t=17662"&gt;here&lt;/a&gt;, &lt;a href="http://seqanswers.com/forums/showthread.php?t=23962"&gt;here&lt;/a&gt;, &lt;a href="http://seqanswers.com/forums/showthread.php?t=21020"&gt;here&lt;/a&gt;,&amp;nbsp;&lt;a href="http://seqanswers.com/forums/showthread.php?t=21708"&gt;here&lt;/a&gt;, and &lt;a href="http://www.biostars.org/p/6317/"&gt;here&lt;/a&gt;, for a start). The core I run operates in a production environment where everything I do &lt;i&gt;must be absolutely transparent and reproducible&lt;/i&gt;. Reporting drastically different results to my collaborators whenever I update the tools I'm using is very alarming to a biologist, and reduces their confidence in the service I provide and the tools I use.&amp;nbsp;&lt;/div&gt;
&lt;div class="p1"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
Furthermore, the authors of a &lt;a href="http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3460195/"&gt;recent methods paper&lt;/a&gt; compared their tool, &lt;a href="http://bioconductor.org/packages/release/bioc/html/DEXSeq.html"&gt;DEXSeq&lt;/a&gt;, to several different versions of cuffdiff. Here, the authors performed two comparisons: a "proper" comparison, where replicates of treatments (T1-T3) were compared to replicates of controls (C1-C4), and a "mock" comparison, where controls (e.g. C1+C3) were compared to other controls (C2+C4). The most haunting result is shown below, where the "proper" comparison finds relatively few differentially expressed genes, while the "mock" comparison of controls versus other controls finds many, many more differentially expressed genes, and an increasing number with newer versions of cufflinks:&lt;/div&gt;
&lt;div class="p1"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;table align="center" cellpadding="0" cellspacing="0" class="tr-caption-container" style="margin-left: auto; margin-right: auto; text-align: center;"&gt;&lt;tbody&gt;
&lt;tr&gt;&lt;td style="text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/-1kbM7sYHSho/UM-IfRPmYcI/AAAAAAABCWM/SgFlgg_azqA/s1600/dexseqcomparison.png" imageanchor="1" style="margin-left: auto; margin-right: auto;"&gt;&lt;img border="0" src="http://2.bp.blogspot.com/-1kbM7sYHSho/UM-IfRPmYcI/AAAAAAABCWM/SgFlgg_azqA/s1600/dexseqcomparison.png" /&gt;&lt;/a&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class="tr-caption" style="text-align: center;"&gt;Table S1 from the &lt;a href="http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3460195/"&gt;DEXSeq paper&lt;/a&gt;.&lt;/td&gt;&lt;/tr&gt;
&lt;/tbody&gt;&lt;/table&gt;
&lt;div class="p1"&gt;
This comparison predates the release of Cuffdiff 2, so perhaps this alarming trend ceases with the newer release of Cufflinks. However, it is worth noting that the data shown here are from a&lt;i&gt;&amp;nbsp;&lt;/i&gt;&lt;a href="http://genome.cshlp.org/content/21/2/193"&gt;&lt;i&gt;real dataset&lt;/i&gt;&lt;/a&gt;&lt;i&gt;, &lt;/i&gt;whereas all the comparisons in the new Cuffdiff 2 paper were done with simulations. Having done some method development myself, I realize how easy it is to construct a simulation scenario to support nearly any claim you'd like to make.&lt;/div&gt;
&lt;div class="p1"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
&lt;b&gt;FINAL THOUGHTS&lt;/b&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
Most RNA-seq folks would say that the field has a good handle on differential expression at the gene level, while differential expression at isoform-level resolution is still under development. I would tend to agree with this statement, but if cases as presented in &lt;a href="http://www.nature.com/nbt/journal/vaop/ncurrent/fig_tab/nbt.2450_F1.html"&gt;Figure 1&lt;/a&gt; of this paper are&amp;nbsp;biologically important and&amp;nbsp;widespread (&lt;a href="http://www.nature.com/nature/journal/v489/n7414/full/nature11233.html"&gt;they very well may be&lt;/a&gt;), then perhaps we have some re-thinking to do, even with what we thought were "simple" analyses at the gene level.&amp;nbsp;&lt;/div&gt;
&lt;div class="p1"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="p1"&gt;
What's your workflow for RNA-seq analysis? Discuss.&lt;/div&gt;
&lt;div class="blogger-post-footer"&gt;Getting Genetics Done by Stephen Turner is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/GettingGeneticsDone/~4/SFitmGdm_8s" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://gettinggeneticsdone.blogspot.com/feeds/2023931904980280892/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2012/12/differential-isoform-expression-cuffdiff2.html#comment-form" title="24 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/2023931904980280892?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/2023931904980280892?v=2" /><link rel="alternate" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2012/12/differential-isoform-expression-cuffdiff2.html" title="Differential Isoform Expression With RNA-Seq: Are We Really There Yet?" 
/><author><name>Stephen Turner</name><uri>http://www.blogger.com/profile/06656711316726116187</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="26" height="32" src="http://3.bp.blogspot.com/-aT3qBWI4VYc/TgvR9CnlS0I/AAAAAAAAMDk/KuA2GGqURcc/s220/pic2-cropped-400x500.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://3.bp.blogspot.com/-UN2mxaEvKV0/UM9-SZqwkHI/AAAAAAABCV8/8HGLF9_oZKE/s72-c/cuffdiff1.png" height="72" width="72" /><thr:total>24</thr:total></entry><entry gd:etag="W/&quot;D0MAQH8_fCp7ImA9WhNbEkg.&quot;"><id>tag:blogger.com,1999:blog-6232819486261696035.post-4069672491787383673</id><published>2012-11-26T13:22:00.000-06:00</published><updated>2013-01-15T07:10:41.144-06:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2013-01-15T07:10:41.144-06:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Productivity" /><title>Copy Text to the Local Clipboard from a Remote SSH Session</title><content type="html">This is an issue that has bugged me for years, and I've finally found a good solution on &lt;a href="http://osxdaily.com/2011/05/05/transfer-clipboard-text-source-code-between-macs-with-ssh/"&gt;osxdaily&lt;/a&gt; and &lt;a href="http://stackoverflow.com/questions/1152362/getting-items-on-the-local-clipboard-from-a-remote-ssh-session"&gt;Stack Overflow&lt;/a&gt;. I'm using Terminal on my OSX laptop to connect to a headless Linux machine over SSH, and I want to copy the output from a command (on the remote server) to the local clipboard on my Mac using only the keyboard (i.e., a pipe). In essence:&lt;br /&gt;
&lt;br /&gt;
&lt;span style="font-family: Courier New, Courier, monospace;"&gt;commandThatMakesOutput | sendToLocalClipboard&lt;/span&gt;&lt;br /&gt;
&lt;br /&gt;
If I'm working from the command line on my local machine, the &lt;a href="http://developer.apple.com/library/mac/#documentation/Darwin/Reference/ManPages/man1/pbcopy.1.html"&gt;pbcopy&lt;/a&gt; command does this for me. For example, while working on my local mac, if I wanted to copy all the filenames in the directory to my clipboard, I could use:&lt;br /&gt;
&lt;br /&gt;
&lt;span style="font-family: Courier New, Courier, monospace;"&gt;ls | pbcopy&lt;/span&gt;&lt;br /&gt;
&lt;br /&gt;
To do this for output generated over SSH, the general solution works like this:&lt;br /&gt;
&lt;br /&gt;
&lt;span style="font-family: Courier New, Courier, monospace;"&gt;commandThatMakesOutput | ssh desktop pbcopy&lt;/span&gt;&lt;br /&gt;
&lt;br /&gt;
When you're SSH'd into a remote host, this will take the output of &lt;span style="font-family: Courier New, Courier, monospace;"&gt;commandThatMakesOutput&lt;/span&gt; and pipe it to the &lt;span style="font-family: Courier New, Courier, monospace;"&gt;pbcopy&lt;/span&gt; command of the &lt;i&gt;desktop&lt;/i&gt; machine that you &lt;span style="font-family: Courier New, Courier, monospace;"&gt;ssh&lt;/span&gt; into &lt;i&gt;from the remote host&lt;/i&gt;. In other words, &lt;i&gt;you ssh back into your local machine&lt;/i&gt; from the remote host, passing output from&amp;nbsp;&lt;span style="font-family: 'Courier New', Courier, monospace;"&gt;commandThatMakesOutput&lt;/span&gt;&amp;nbsp;back to &lt;span style="font-family: Courier New, Courier, monospace;"&gt;pbcopy&lt;/span&gt; on your local machine.&lt;br /&gt;
&lt;br /&gt;
There is one required step and three recommendations.&lt;br /&gt;
&lt;br /&gt;
First, you must configure your local desktop as an SSH server. I'm assuming you're aware of any security risks you might be taking when doing this. Open system preferences, sharing. Enable remote login, preferably allowing &lt;i&gt;only you&lt;/i&gt;&amp;nbsp;to log in, for added security. You'll set up SSH keys later.&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://2.bp.blogspot.com/-xcWaVN3YGBA/UJl2xobITKI/AAAAAAABB3k/EieRT2vq2iY/s1600/sshmac.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" src="http://2.bp.blogspot.com/-xcWaVN3YGBA/UJl2xobITKI/AAAAAAABB3k/EieRT2vq2iY/s1600/sshmac.png" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;div class="separator" style="clear: both; text-align: left;"&gt;
Here, take note of how to SSH into your machine. I've blurred out my IP address where it says "To log into this computer..." This brings up the first recommendation: you need a static IP address for this to work well, otherwise you'll have to constantly look up your IP address and modify the command you'll use to SSH back into your local machine from the remote host. Most institutions simply hand out static IPs if you ask nicely.&amp;nbsp;&lt;/div&gt;
&lt;div class="separator" style="clear: both; text-align: left;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="separator" style="clear: both; text-align: left;"&gt;
The second recommendation is that you create an SSH key pair, storing the public key on your mac in ~/.ssh/authorized_keys and your private key on the remote host, so that you don't have to type a password. Make sure you &lt;span style="font-family: Courier New, Courier, monospace;"&gt;chmod&lt;/span&gt; these files appropriately.&amp;nbsp;&lt;a href="http://lmgtfy.com/?q=ssh+keys"&gt;I'll leave this up to you and Google&lt;/a&gt;.&lt;/div&gt;
&lt;div class="separator" style="clear: both; text-align: left;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="separator" style="clear: both; text-align: left;"&gt;
Finally, you want to alias the command on your remote host to some quick command, like "cb". Preferably you'll add this to your .bashrc, so it's sourced every time you log in. E.g.&lt;/div&gt;
&lt;div class="separator" style="clear: both; text-align: left;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="separator" style="clear: both; text-align: left;"&gt;
&lt;span style="font-family: Courier New, Courier, monospace;"&gt;alias cb="ssh user@123.456.7.8 pbcopy"&lt;/span&gt;&lt;/div&gt;
&lt;div class="separator" style="clear: both; text-align: left;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="separator" style="clear: both; text-align: left;"&gt;
So now, if you're logged into a remote host and you want to copy the output of pwd (on the remote host) to your local clipboard, you can simply use:&lt;/div&gt;
&lt;div class="separator" style="clear: both; text-align: left;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="separator" style="clear: both; text-align: left;"&gt;
&lt;span style="font-family: Courier New, Courier, monospace;"&gt;pwd | cb&lt;/span&gt;&lt;/div&gt;
&lt;div class="separator" style="clear: both; text-align: left;"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="separator" style="clear: both; text-align: left;"&gt;
I don't really know what this would do with anything other than small bits of ASCII text, so if you pipe binary, or the output of &lt;span style="font-family: Courier New, Courier, monospace;"&gt;gzip -c&lt;/span&gt; to pbcopy, you're voiding your nonexistent warranty. Presumably there's a way to do this with Windows - I don't know of a command-line utility to give you access to the clipboard, but if you can think of a way, please leave a comment.&lt;/div&gt;
&lt;div class="blogger-post-footer"&gt;Getting Genetics Done by Stephen Turner is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/GettingGeneticsDone/~4/1dTH-sOXyRc" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://gettinggeneticsdone.blogspot.com/feeds/4069672491787383673/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2012/11/copy-text-to-local-clipboard-from.html#comment-form" title="1 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/4069672491787383673?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/4069672491787383673?v=2" /><link rel="alternate" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2012/11/copy-text-to-local-clipboard-from.html" title="Copy Text to the Local Clipboard from a Remote SSH Session" /><author><name>Stephen Turner</name><uri>http://www.blogger.com/profile/06656711316726116187</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="26" height="32" src="http://3.bp.blogspot.com/-aT3qBWI4VYc/TgvR9CnlS0I/AAAAAAAAMDk/KuA2GGqURcc/s220/pic2-cropped-400x500.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://2.bp.blogspot.com/-xcWaVN3YGBA/UJl2xobITKI/AAAAAAABB3k/EieRT2vq2iY/s72-c/sshmac.png" height="72" width="72" /><thr:total>1</thr:total></entry><entry gd:etag="W/&quot;DUQGQ3wzfCp7ImA9WhNRE00.&quot;"><id>tag:blogger.com,1999:blog-6232819486261696035.post-4951002562518642842</id><published>2012-11-07T10:54:00.001-06:00</published><updated>2012-11-07T10:55:22.284-06:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-11-07T10:55:22.284-06:00</app:edited><category 
scheme="http://www.blogger.com/atom/ns#" term="Search" /><category scheme="http://www.blogger.com/atom/ns#" term="ENCODE" /><category scheme="http://www.blogger.com/atom/ns#" term="Databases" /><category scheme="http://www.blogger.com/atom/ns#" term="Annotation" /><category scheme="http://www.blogger.com/atom/ns#" term="Bioinformatics" /><title>RegulomeDB: Identify DNA Features and Regulatory Elements in Non-Coding Regions</title><content type="html">Many papers have noted the challenges associated with assigning function to non-coding genetic variation, and since the majority of GWAS hits for common traits are non-coding, resources for providing some mechanism for these associations are desperately needed.
&lt;br /&gt;
&lt;br /&gt;
Boyle and colleagues have constructed a database called &lt;a href="http://regulome.stanford.edu/"&gt;RegulomeDB&lt;/a&gt; to provide functional assignments to variants using data from manual curation, ChIP-seq data, chromatin state information, eQTLs across multiple cell lines, and some computational predictions generated from DNase footprinting and transcription factor binding motifs.
&lt;br /&gt;
&lt;br /&gt;
RegulomeDB implements a tiered category system (1-6) where category 1 has an eQTL association in addition to other ENCODE sources of data, categories 2-5 have some ENCODE data only with no eQTL associations, and category 6 has evidence of a binding motif change only.  As you might imagine, the annotation density increases as you increase category numbers.
&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://1.bp.blogspot.com/-GhWR_GuJd50/UIwuu3CTqdI/AAAAAAAABQQ/APmc65Nzj_Y/s1600/regulome_screenshot.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="206" src="http://1.bp.blogspot.com/-GhWR_GuJd50/UIwuu3CTqdI/AAAAAAAABQQ/APmc65Nzj_Y/s320/regulome_screenshot.png" width="320" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;br /&gt;
Their simple but impressive interface will accept RS numbers, or whole BED, GFF, or VCF files for annotation.  The resulting output (example above) is downloadable, providing both specifics of the annotation (such as the transcription factor binding to the area) and the functional score for the variant.&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://regulome.stanford.edu/"&gt;http://regulome.stanford.edu/&lt;/a&gt;&lt;div class="blogger-post-footer"&gt;Getting Genetics Done by Stephen Turner is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/GettingGeneticsDone/~4/b60G12UIGcU" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://gettinggeneticsdone.blogspot.com/feeds/4951002562518642842/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2012/11/regulomedb-identify-dna-features-and.html#comment-form" title="1 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/4951002562518642842?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/4951002562518642842?v=2" /><link rel="alternate" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2012/11/regulomedb-identify-dna-features-and.html" title="RegulomeDB: Identify DNA Features and Regulatory Elements in Non-Coding Regions" /><author><name>Will</name><uri>http://www.blogger.com/profile/09703349044940180835</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="http://1.bp.blogspot.com/-DPV6nrTtGHY/UPTSZniomRI/AAAAAAAABRc/_nDd2s1gwko/s220/19bf088.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://1.bp.blogspot.com/-GhWR_GuJd50/UIwuu3CTqdI/AAAAAAAABQQ/APmc65Nzj_Y/s72-c/regulome_screenshot.png" height="72" width="72" /><thr:total>1</thr:total></entry><entry gd:etag="W/&quot;A0cBRXk5fyp7ImA9WhNbEUU.&quot;"><id>tag:blogger.com,1999:blog-6232819486261696035.post-6841821543408693029</id><published>2012-11-02T11:45:00.000-05:00</published><updated>2013-01-14T12:44:14.727-06:00</updated><app:edited 
xmlns:app="http://www.w3.org/2007/app">2013-01-14T12:44:14.727-06:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="RNA-Seq" /><category scheme="http://www.blogger.com/atom/ns#" term="Software" /><category scheme="http://www.blogger.com/atom/ns#" term="Sequencing" /><category scheme="http://www.blogger.com/atom/ns#" term="Bioinformatics" /><title>STAR: ultrafast universal RNA-seq aligner</title><content type="html">There's a new kid on the block for RNA-seq alignment.&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://bioinformatics.oxfordjournals.org/content/early/2012/10/25/bioinformatics.bts635.abstract"&gt;Dobin, Alexander, et al. "STAR: ultrafast universal RNA-seq aligner." Bioinformatics (2012).&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
Aligning RNA-seq data is challenging because reads can overlap splice junctions. Many other RNA-seq alignment algorithms (e.g. &lt;a href="http://tophat.cbcb.umd.edu/"&gt;Tophat&lt;/a&gt;) are built on top of DNA sequence aligners. &lt;a href="http://code.google.com/p/rna-star/"&gt;STAR&lt;/a&gt; (Spliced Transcripts Alignment to a Reference) is a standalone RNA-seq alignment algorithm that uses uncompressed suffix arrays and a mapping algorithm similar to those used in large-scale genome alignment tools to align RNA-seq reads to a genomic reference. STAR is over 50 times faster than any other previously published RNA-seq aligner, and outperforms other aligners in both sensitivity and specificity using both simulated and real (replicated) RNA-seq data.&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://bioinformatics.oxfordjournals.org/content/early/2012/10/25/bioinformatics.bts635.abstract" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" src="http://3.bp.blogspot.com/-5WTELB6A4JQ/UJP22me-QvI/AAAAAAABB3M/lEk0FNWnPcs/s1600/staraligner.png" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;br /&gt;
The notable increase in speed comes at the price of a larger memory requirement. STAR requires ~27GB RAM to align reads to a human genome - a moderate amount, but not atypical on most modern servers. STAR aligns ~45 million paired reads per hour per processor, and scales nearly linearly with the number of processors (without appreciably increasing RAM usage). Notably, the STAR algorithm is also capable of handling longer reads such as those from PacBio and the upcoming Oxford Nanopore technologies. STAR is free and open source software.&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://bioinformatics.oxfordjournals.org/content/early/2012/10/25/bioinformatics.bts635.abstract"&gt;Dobin, Alexander, et al. "STAR: ultrafast universal RNA-seq aligner." Bioinformatics (2012).&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://code.google.com/p/rna-star/"&gt;STAR software on Google Code&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
(This post adapted from &lt;a href="http://dx.doi.org/10.3410/f.717961569.793464455"&gt;my review on F1000&lt;/a&gt;).&lt;div class="blogger-post-footer"&gt;Getting Genetics Done by Stephen Turner is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/GettingGeneticsDone/~4/KZ1jA8euJu4" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://gettinggeneticsdone.blogspot.com/feeds/6841821543408693029/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2012/11/star-ultrafast-universal-rna-seq-aligner.html#comment-form" title="1 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/6841821543408693029?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/6841821543408693029?v=2" /><link rel="alternate" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2012/11/star-ultrafast-universal-rna-seq-aligner.html" title="STAR: ultrafast universal RNA-seq aligner" /><author><name>Stephen Turner</name><uri>http://www.blogger.com/profile/06656711316726116187</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="26" height="32" src="http://3.bp.blogspot.com/-aT3qBWI4VYc/TgvR9CnlS0I/AAAAAAAAMDk/KuA2GGqURcc/s220/pic2-cropped-400x500.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://3.bp.blogspot.com/-5WTELB6A4JQ/UJP22me-QvI/AAAAAAABB3M/lEk0FNWnPcs/s72-c/staraligner.png" height="72" width="72" /><thr:total>1</thr:total></entry><entry 
gd:etag="W/&quot;DkABQHg7eSp7ImA9WhJbFU4.&quot;"><id>tag:blogger.com,1999:blog-6232819486261696035.post-3394391914899318887</id><published>2012-09-24T19:59:00.000-05:00</published><updated>2012-09-24T19:59:11.601-05:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-09-24T19:59:11.601-05:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Tutorials" /><category scheme="http://www.blogger.com/atom/ns#" term="Python" /><category scheme="http://www.blogger.com/atom/ns#" term="R" /><category scheme="http://www.blogger.com/atom/ns#" term="Bioinformatics" /><title>Learn R and Python, and Have Fun Doing It</title><content type="html">If you need to catch up on all those years you spent &lt;i&gt;not&lt;/i&gt;&amp;nbsp;learning how to code (&lt;a href="http://gettinggeneticsdone.blogspot.com/2012/01/new-years-resolution-learn-how-to-code.html"&gt;you &lt;i&gt;need&lt;/i&gt; to know how to code&lt;/a&gt;), here are a few resources to help you quickly learn R and Python, and have a little fun doing it.&lt;br /&gt;
&lt;br /&gt;
First, the free online Coursera course &lt;i&gt;&lt;a href="https://www.coursera.org/course/compdata"&gt;Computing for Data Analysis&lt;/a&gt;&lt;/i&gt; just started. The 4-week course is being taught by &lt;a href="http://www.biostat.jhsph.edu/~rpeng/"&gt;Roger Peng&lt;/a&gt;, associate professor of biostatistics at Johns Hopkins, and blogger at &lt;a href="http://simplystatistics.org/"&gt;Simply Statistics&lt;/a&gt;. From the course description:&lt;br /&gt;
&lt;blockquote class="tr_bq"&gt;
&lt;i&gt;This course is about learning the fundamental computing skills necessary for effective data analysis. You will learn to program in R and to use R for reading data, writing functions, making informative graphs, and applying modern statistical methods.&lt;/i&gt;&lt;/blockquote&gt;
Here's a short video about the course from the instructor:&lt;br /&gt;
&lt;br /&gt;
&lt;iframe allowfullscreen="allowfullscreen" frameborder="0" height="338" src="http://www.youtube.com/embed/gk6E57H6mTs?rel=0" width="600"&gt;&lt;/iframe&gt;&lt;br /&gt;
&lt;br /&gt;
Next, for quickly learning Python, there's the &lt;a href="http://www.codecademy.com/tracks/python"&gt;Python track on Codecademy&lt;/a&gt;. Codecademy takes an interactive approach to teaching coding. The interface gives you some basic instruction and prompts you to enter short code snippets to accomplish a task. Codecademy makes learning to code fun by giving you short projects to complete (e.g. a tip calculator), and rewarding you with badges for your accomplishments, which allow you to "compete" with friends.&lt;br /&gt;
&lt;br /&gt;
Once you've learned some basic skills, you really only get better with practice and problem solving. &lt;a href="http://projecteuler.net/"&gt;Project Euler&lt;/a&gt; has been around for some time, and you can find many solutions out there on the web using many different languages, but the problems are more purely mathematical in nature. For short problems perhaps more relevant, head over to &lt;a href="http://rosalind.info/problems/as-table/"&gt;Rosalind.info&lt;/a&gt; for some bioinformatics programming challenges ranging from something as simple as &lt;a href="http://rosalind.info/problems/dna/"&gt;counting nucleotides&lt;/a&gt; or &lt;a href="http://rosalind.info/problems/gc/"&gt;computing GC content&lt;/a&gt;, to something more difficult, such as &lt;a href="http://rosalind.info/problems/gasm/"&gt;genome assembly&lt;/a&gt;.&lt;br /&gt;
&lt;br /&gt;
&lt;a href="https://www.coursera.org/course/compdata"&gt;Coursera: Computing for Data Analysis&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://www.codecademy.com/tracks/python"&gt;Codecademy Python Track&lt;/a&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://rosalind.info/"&gt;Rosalind.info bioinformatics programming challenges&lt;/a&gt;&lt;div class="blogger-post-footer"&gt;Getting Genetics Done by Stephen Turner is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/GettingGeneticsDone/~4/cbibB94J3ZE" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://gettinggeneticsdone.blogspot.com/feeds/3394391914899318887/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2012/09/learn-r-and-python-and-have-fun-doing-it.html#comment-form" title="1 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/3394391914899318887?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/3394391914899318887?v=2" /><link rel="alternate" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2012/09/learn-r-and-python-and-have-fun-doing-it.html" title="Learn R and Python, and Have Fun Doing It" /><author><name>Stephen Turner</name><uri>http://www.blogger.com/profile/06656711316726116187</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="26" height="32" src="http://3.bp.blogspot.com/-aT3qBWI4VYc/TgvR9CnlS0I/AAAAAAAAMDk/KuA2GGqURcc/s220/pic2-cropped-400x500.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://img.youtube.com/vi/gk6E57H6mTs/default.jpg" height="72" width="72" /><thr:total>1</thr:total></entry><entry gd:etag="W/&quot;A0cBRXk5fSp7ImA9WhNbEUU.&quot;"><id>tag:blogger.com,1999:blog-6232819486261696035.post-3718639602353197277</id><published>2012-09-18T15:08:00.000-05:00</published><updated>2013-01-14T12:44:14.725-06:00</updated><app:edited 
xmlns:app="http://www.w3.org/2007/app">2013-01-14T12:44:14.725-06:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="RNA-Seq" /><category scheme="http://www.blogger.com/atom/ns#" term="Sequencing" /><category scheme="http://www.blogger.com/atom/ns#" term="R" /><category scheme="http://www.blogger.com/atom/ns#" term="Bioinformatics" /><title>DESeq vs edgeR Comparison</title><content type="html">&lt;i&gt;Update (Dec 18, 2012): Please see &lt;a href="http://gettinggeneticsdone.blogspot.com/2012/12/differential-isoform-expression-cuffdiff2.html"&gt;this related post&lt;/a&gt; I wrote about differential isoform expression analysis with &lt;a href="http://gettinggeneticsdone.blogspot.com/2012/12/differential-isoform-expression-cuffdiff2.html"&gt;Cuffdiff 2&lt;/a&gt;.&lt;/i&gt;&lt;br /&gt;
&lt;br /&gt;
&lt;a href="http://genomebiology.com/2010/11/10/R106"&gt;DESeq&lt;/a&gt; and &lt;a href="http://bioinformatics.oxfordjournals.org/content/26/1/139.long"&gt;edgeR&lt;/a&gt; are two methods and R packages for analyzing quantitative readouts (in the form of counts) from high-throughput experiments such as RNA-seq or ChIP-seq. After alignment, reads are assigned to a feature, where each feature represents a target transcript, in the case of RNA-Seq, or a binding region, in the case of ChIP-Seq. An important summary statistic is the &lt;i&gt;count&lt;/i&gt;&amp;nbsp;of the&amp;nbsp;number of reads in a feature (&lt;a href="http://www.nature.com/nmeth/journal/v5/n7/full/nmeth.1226.html"&gt;for RNA-Seq, this read count is a good approximation of transcript abundance&lt;/a&gt;).&lt;br /&gt;
&lt;br /&gt;
Methods used to analyze array-based data assume a normally distributed, &lt;i&gt;continuous&lt;/i&gt; response variable. However, response variables for digital methods like RNA-seq and ChIP-seq are &lt;i&gt;discrete counts&lt;/i&gt;. Thus, both DESeq and edgeR methods are based on the negative binomial distribution.&lt;br /&gt;
&lt;br /&gt;
I see these two tools often used interchangeably, and I wanted to take a look at how they stack up to one another in terms of performance, ease of use, and speed. This isn't meant to be a comprehensive evaluation or "bake-off" between the two methods. This would require complex simulations, parameter sweeps, and evaluation with multiple well-characterized real RNA-seq datasets. Further, this is only a start - a full evaluation would need to be much more comprehensive.&lt;br /&gt;
&lt;br /&gt;
Here, I used the newest versions of both &lt;a href="http://bioconductor.org/packages/devel/bioc/html/edgeR.html"&gt;edgeR&lt;/a&gt; and &lt;a href="http://bioconductor.org/packages/devel/bioc/html/DESeq.html"&gt;DESeq&lt;/a&gt;, using the well-characterized &lt;a href="http://genome.cshlp.org/content/early/2010/10/04/gr.108662.110"&gt;Pasilla dataset&lt;/a&gt;, available in the &lt;a href="http://bioconductor.org/packages/release/data/experiment/html/pasilla.html"&gt;pasilla&lt;/a&gt; Bioconductor package. The dataset is from an experiment in &lt;i&gt;Drosophila&lt;/i&gt;&amp;nbsp;investigating the effect of RNAi knockdown of the splicing factor, &lt;i&gt;pasilla&lt;/i&gt;. I used the GLM functionality of both packages, as recommended by the vignettes, for dealing with a multifactorial experiment (condition: treated vs. untreated; library type: single-end and paired-end).&lt;br /&gt;
&lt;br /&gt;
&lt;script src="https://gist.github.com/3745236.js?file=deseq-vs-edger.R"&gt;&lt;/script&gt;&lt;br /&gt;
&lt;br /&gt;
Both packages provide built-in functions for assessing overall similarity between samples using either PCA (DESeq) or MDS (edgeR), although these methods operate on the same underlying data and could easily be switched.&lt;br /&gt;
&lt;br /&gt;
PCA plot on variance stabilized data from DESeq:&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://4.bp.blogspot.com/-q0tPhin3wqY/UFjKtbQHWlI/AAAAAAABBOc/sHJLQpoijQI/s1600/deseq-pca.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" src="http://4.bp.blogspot.com/-q0tPhin3wqY/UFjKtbQHWlI/AAAAAAABBOc/sHJLQpoijQI/s1600/deseq-pca.png" /&gt;&lt;/a&gt;&lt;/div&gt;
MDS plot from edgeR:&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://3.bp.blogspot.com/-zZ99j-bXyw4/UFjKupah6gI/AAAAAAABBO0/PK1Rk9j15aE/s1600/edger-mds.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" src="http://3.bp.blogspot.com/-zZ99j-bXyw4/UFjKupah6gI/AAAAAAABBO0/PK1Rk9j15aE/s1600/edger-mds.png" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
Per gene dispersion estimates from DESeq:&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://4.bp.blogspot.com/-KQu1yBBn9As/UFjKtIahRdI/AAAAAAABBOU/lIFVMShygf4/s1600/deseq-dispersion.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" src="http://4.bp.blogspot.com/-KQu1yBBn9As/UFjKtIahRdI/AAAAAAABBOU/lIFVMShygf4/s1600/deseq-dispersion.png" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
Biological coefficient of variation versus abundance (edgeR):&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://4.bp.blogspot.com/-8QNCpJozcgU/UFjKt7seo4I/AAAAAAABBOk/_o8HhuZBKlk/s1600/edger-bcv.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" src="http://4.bp.blogspot.com/-8QNCpJozcgU/UFjKt7seo4I/AAAAAAABBOk/_o8HhuZBKlk/s1600/edger-bcv.png" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
Now, let's see how many statistically significant (FDR&amp;lt;0.05) results each method returns:&lt;br /&gt;
&lt;br /&gt;
&lt;script src="https://gist.github.com/3745338.js?file=deseq-vs-edger-output1.R"&gt;&lt;/script&gt;&lt;br /&gt;
&lt;br /&gt;
In this simple example, DESeq finds 820 genes significantly differentially expressed at FDR&amp;lt;0.05, while edgeR finds these 820 and an additional 371. Let's take a look at the detected fold changes from both methods:&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://1.bp.blogspot.com/-lFaRP3mimZg/UFjKvqvJqmI/AAAAAAABBPE/N6x-l1bMjRs/s1600/fc-comparison-nofilter.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" src="http://1.bp.blogspot.com/-lFaRP3mimZg/UFjKvqvJqmI/AAAAAAABBPE/N6x-l1bMjRs/s1600/fc-comparison-nofilter.png" /&gt;&lt;/a&gt;&lt;/div&gt;
Here, if genes were found differentially expressed by edgeR only, they're colored red; if found by both, colored green. What's striking here is that for a handful of genes, DESeq is (1) reporting massive fold changes, and (2) not calling them statistically significant. What's going on here?&lt;br /&gt;
&lt;br /&gt;
It turns out that these genes have extremely low counts (usually one or two counts in only one or two samples). The &lt;a href="http://bioconductor.org/packages/devel/bioc/vignettes/DESeq/inst/doc/DESeq.pdf"&gt;DESeq vignette&lt;/a&gt; goes through the logic of independent filtering, showing that the likelihood of a gene being significantly differentially expressed is related to how strongly it's expressed, and advocates for discarding extremely lowly expressed genes, because differential expression is likely not statistically detectable.&lt;br /&gt;
&lt;br /&gt;
Count-based filtering can be achieved two ways. The DESeq vignette demonstrates how to filter based on quantiles, while I used the filtering method demonstrated in the edgeR vignette - removing genes without at least 2 counts per million in at least two samples. This filtering code is commented out above - uncomment to filter.&lt;br /&gt;
&lt;br /&gt;
After filtering, all of the genes shown above with apparently large fold changes as detected by DESeq are removed prior to analysis, and the fold changes correlate much better between the two methods. edgeR still detects ~50% more differentially expressed genes, and it's unclear to me (1) why this is the case, and (2) if this is necessarily a good thing.&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://2.bp.blogspot.com/-GD_DzgSy5HU/UFjKvJPdBBI/AAAAAAABBO8/v5VyjzgodQM/s1600/fc-comparison-filter.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" src="http://2.bp.blogspot.com/-GD_DzgSy5HU/UFjKvJPdBBI/AAAAAAABBO8/v5VyjzgodQM/s1600/fc-comparison-filter.png" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;b&gt;Conclusions:&lt;/b&gt;&lt;br /&gt;
&lt;br /&gt;
Unfortunately, I may have oversold the title here - this is such a cursory comparison of the two methods that I would hesitate to draw any conclusions about which method is &lt;i&gt;better&lt;/i&gt;&amp;nbsp;than the other. In addition to finding more significantly differentially expressed genes (again, not necessarily a good thing), I can say that edgeR was much &lt;i&gt;faster&lt;/i&gt;&amp;nbsp;than DESeq for fitting GLM models, but it took slightly longer to estimate the dispersion. Further, without any independent filtering, edgeR gave me moderated fold changes for the extremely lowly expressed genes for which DESeq returned logFCs in the 20-30 range (but these transcripts were so lowly expressed anyway, they should have been filtered out before any evaluation).&lt;br /&gt;
&lt;br /&gt;
If there's one thing that will make me use edgeR over DESeq (until I have time to do a more thorough evaluation), it's the fact that &lt;i&gt;using&lt;/i&gt;&amp;nbsp;edgeR seems much more natural than DESeq, especially if you're familiar with the &lt;a href="http://bioconductor.org/packages/release/bioc/html/limma.html"&gt;limma package&lt;/a&gt; (pretty much the standard for analyzing microarray data and other continuously distributed gene expression data). Setting up the design matrix and specifying contrasts feels natural if you're familiar with using limma. Further, the &lt;a href="http://bioconductor.org/packages/devel/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf"&gt;edgeR user guide&lt;/a&gt; weighs in at 67 pages, filled with many case studies that will help you in putting together a design matrix for nearly any experimental design: paired designs, time courses, batch effects, interactions, etc. The &lt;a href="http://bioconductor.org/packages/devel/bioc/vignettes/DESeq/inst/doc/DESeq.pdf"&gt;DESeq documentation&lt;/a&gt; is still fantastic, but could benefit from a few more case studies / examples.&lt;br /&gt;
&lt;br /&gt;
What do you think? Anyone want to &lt;a href="https://help.github.com/articles/fork-a-repo"&gt;fork&lt;/a&gt; my &lt;a href="https://gist.github.com/3745236"&gt;R code&lt;/a&gt; and help do this comparison more comprehensively (more examples, simulated data, speed benchmarking)? Is the analysis above fair? What do you find more easy to use, or is ease-of-use (and thus, reproducibility) even important when considering data analysis?&lt;div class="blogger-post-footer"&gt;Getting Genetics Done by Stephen Turner is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/GettingGeneticsDone/~4/CAi8VFFiE9o" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://gettinggeneticsdone.blogspot.com/feeds/3718639602353197277/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2012/09/deseq-vs-edger-comparison.html#comment-form" title="18 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/3718639602353197277?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/3718639602353197277?v=2" /><link rel="alternate" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2012/09/deseq-vs-edger-comparison.html" title="DESeq vs edgeR Comparison" /><author><name>Stephen Turner</name><uri>http://www.blogger.com/profile/06656711316726116187</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="26" height="32" src="http://3.bp.blogspot.com/-aT3qBWI4VYc/TgvR9CnlS0I/AAAAAAAAMDk/KuA2GGqURcc/s220/pic2-cropped-400x500.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://4.bp.blogspot.com/-q0tPhin3wqY/UFjKtbQHWlI/AAAAAAABBOc/sHJLQpoijQI/s72-c/deseq-pca.png" height="72" 
width="72" /><thr:total>18</thr:total></entry><entry gd:etag="W/&quot;CEMDQHgzcCp7ImA9WhJVEUs.&quot;"><id>tag:blogger.com,1999:blog-6232819486261696035.post-4645337786345003486</id><published>2012-08-28T09:01:00.001-05:00</published><updated>2012-08-28T09:01:11.688-05:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-08-28T09:01:11.688-05:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Statistics" /><category scheme="http://www.blogger.com/atom/ns#" term="Visualization" /><category scheme="http://www.blogger.com/atom/ns#" term="R" /><title>More on Exploring Correlations in R</title><content type="html">About a year ago I wrote a post about producing &lt;a href="http://gettinggeneticsdone.blogspot.com/2011/07/scatterplot-matrices-in-r.html"&gt;scatterplot matrices in R&lt;/a&gt;. These are handy for quickly getting a sense of the correlations that exist in your data. Recently someone asked me to pull out some relevant statistics (correlation coefficient and p-value) into tabular format to publish beside a scatterplot matrix. The built-in &lt;span style="font-family: Courier New, Courier, monospace;"&gt;cor()&lt;/span&gt; function will produce a correlation matrix, but what if you want p-values for those correlation coefficients? Also, instead of a matrix, how might you get these statistics in tabular format (variable &lt;i&gt;i&lt;/i&gt;, variable &lt;i&gt;j&lt;/i&gt;, r, and p, for each &lt;i&gt;i&lt;/i&gt;-&lt;i&gt;j&lt;/i&gt; combination)? Here's the code (you'll need the &lt;span style="font-family: Courier New, Courier, monospace;"&gt;PerformanceAnalytics&lt;/span&gt; package to produce the plot).&lt;br /&gt;
&lt;br /&gt;
&lt;script src="https://gist.github.com/3492773.js?file=explore-correlations.r"&gt;&lt;/script&gt;&lt;br /&gt;
The &lt;span style="font-family: Courier New, Courier, monospace;"&gt;cor()&lt;/span&gt; function will produce a basic correlation matrix. &amp;nbsp;12 years ago &lt;a href="https://stat.ethz.ch/pipermail/r-help/2000-January/009758.html"&gt;Bill Venables provided a function on the R help mailing list&lt;/a&gt; for replacing the upper triangle of the correlation matrix with the p-values for those correlations (based on the known relationship between &lt;i&gt;t&lt;/i&gt; and &lt;i&gt;r&lt;/i&gt;).&amp;nbsp;The &lt;span style="font-family: Courier New, Courier, monospace;"&gt;cor.prob()&lt;/span&gt; function will produce this matrix.&lt;br /&gt;
&lt;br /&gt;
Finally, the &lt;span style="font-family: Courier New, Courier, monospace;"&gt;flattenSquareMatrix()&lt;/span&gt; function will "flatten" this matrix to four columns: one column for variable &lt;i&gt;i&lt;/i&gt;, one for variable &lt;i&gt;j&lt;/i&gt;, one for their correlation, and another for their p-value (thanks to &lt;a href="http://stackoverflow.com/questions/12116207/flatten-matrix-in-r-to-four-columns-indexes-and-upper-lower-triangles"&gt;Chris Wallace on StackOverflow&lt;/a&gt; for helping out with this one).&lt;br /&gt;
&lt;br /&gt;
&lt;script src="https://gist.github.com/3492876.js?file=explore-correlations-output.txt"&gt;&lt;/script&gt;&lt;br /&gt;
&lt;br /&gt;
Finally, the chart.Correlation() function from the PerformanceAnalytics package produces a very nice scatterplot matrix, with histograms, kernel density overlays, absolute correlations, and significance asterisks (0.05, 0.01, 0.001):&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://3.bp.blogspot.com/-p9TU0FkR0MQ/UDvzQBEEOlI/AAAAAAABA8Q/ScF-742PvWE/s1600/explore-correlations.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" src="http://3.bp.blogspot.com/-p9TU0FkR0MQ/UDvzQBEEOlI/AAAAAAABA8Q/ScF-742PvWE/s1600/explore-correlations.png" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;&lt;div class="blogger-post-footer"&gt;Getting Genetics Done by Stephen Turner is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/GettingGeneticsDone/~4/_PNeI9ABL1s" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://gettinggeneticsdone.blogspot.com/feeds/4645337786345003486/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2012/08/more-on-exploring-correlations-in-r.html#comment-form" title="12 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/4645337786345003486?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/4645337786345003486?v=2" /><link rel="alternate" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2012/08/more-on-exploring-correlations-in-r.html" title="More on Exploring Correlations in R" /><author><name>Stephen Turner</name><uri>http://www.blogger.com/profile/06656711316726116187</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="26" height="32" src="http://3.bp.blogspot.com/-aT3qBWI4VYc/TgvR9CnlS0I/AAAAAAAAMDk/KuA2GGqURcc/s220/pic2-cropped-400x500.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://3.bp.blogspot.com/-p9TU0FkR0MQ/UDvzQBEEOlI/AAAAAAABA8Q/ScF-742PvWE/s72-c/explore-correlations.png" height="72" width="72" /><thr:total>12</thr:total></entry><entry gd:etag="W/&quot;D0cCRnk_eyp7ImA9WhJQGE8.&quot;"><id>tag:blogger.com,1999:blog-6232819486261696035.post-2376368188321931074</id><published>2012-08-01T07:42:00.001-05:00</published><updated>2012-08-01T07:44:27.743-05:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-08-01T07:44:27.743-05:00</app:edited><category 
scheme="http://www.blogger.com/atom/ns#" term="ENCODE" /><category scheme="http://www.blogger.com/atom/ns#" term="Recommended Reading" /><category scheme="http://www.blogger.com/atom/ns#" term="Web Apps" /><category scheme="http://www.blogger.com/atom/ns#" term="Sequencing" /><category scheme="http://www.blogger.com/atom/ns#" term="Bioinformatics" /><title>Cscan: Finding Gene Expression Regulators with ENCODE ChIP-Seq Data</title><content type="html">Recently published in &lt;a href="http://nar.oxfordjournals.org/content/40/W1/W510"&gt;Nucleic Acids Research&lt;/a&gt;:&lt;br /&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
F. Zambelli, G. M. Prazzoli, G. Pesole, G. Pavesi, Cscan: finding common regulators of a set of genes by using a collection of genome-wide ChIP-seq datasets., &lt;i&gt;Nucleic acids research&lt;/i&gt; &lt;b&gt;40&lt;/b&gt;, W510–5 (2012).&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;table align="center" cellpadding="0" cellspacing="0" class="tr-caption-container" style="margin-left: auto; margin-right: auto; text-align: center;"&gt;&lt;tbody&gt;
&lt;tr&gt;&lt;td style="text-align: center;"&gt;&lt;a href="http://nar.oxfordjournals.org/content/40/W1/W510" imageanchor="1" style="margin-left: auto; margin-right: auto;"&gt;&lt;img border="0" src="http://1.bp.blogspot.com/-AQp5FJb981g/UBkjjvmBKWI/AAAAAAAA7bI/KZC45xJrBNY/s1600/cscan.png" /&gt;&lt;/a&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class="tr-caption" style="text-align: center;"&gt;Cscan web interface screenshot&lt;/td&gt;&lt;/tr&gt;
&lt;/tbody&gt;&lt;/table&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
This paper presents a methodology and software implementation that allows users to discover a set of transcription factors or epigenetic modifications that regulate a set of genes of interest. A wealth of data about transcription factor binding exists in the public domain, and this is a good example of a group utilizing those resources to develop tools that are of use to the broader computational biology community.&amp;nbsp;&lt;/div&gt;
&lt;div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
High-throughput gene expression experiments like microarrays and RNA-seq experiments often result in a list of differentially regulated or co-expressed genes. A common follow-up question asks which transcription factors may regulate those genes of interest. The ENCODE project has completed ChIP-seq experiments for many transcription factors and epigenetic modifications for a number of different cell lines in both human and model organisms. These researchers crossed this publicly available data on enriched regions from ChIP-seq experiments with genomic coordinates of gene annotations to create a table of gene annotations (rows) by ChIP-peak signals, with a presence/absence peak in each cell. Given a set of genes of interest (e.g. differentially regulated genes from an RNA-seq experiment), the method evaluates the over-/under-representation of target sites for the DNA binding protein in each ChIP experiment using a Fisher's exact test. Other methods based on motif-enrichment (using position weight matrices derived from databases like TRANSFAC or JASPAR) would miss DNA-binding factors like the Retinoblastoma protein (RB), which lacks a DNA-binding domain and is recruited to promoters by other transcription factors. In addition to overcoming this limitation, the method presented here also has the advantage of considering tissue-specificity and chromatin accessibility.&lt;/div&gt;
&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
The web interface is free and doesn't require registration:&amp;nbsp;&lt;a href="http://www.beaconlab.it/cscan"&gt;http://www.beaconlab.it/cscan&lt;/a&gt;&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
&lt;a href="http://nar.oxfordjournals.org/content/40/W1/W510"&gt;Nucleic Acids Research:&amp;nbsp;Cscan: finding common regulators of a set of genes by using a collection of genome-wide ChIP-seq datasets&lt;/a&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;Getting Genetics Done by Stephen Turner is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/GettingGeneticsDone/~4/mHQC9zJ3XPo" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://gettinggeneticsdone.blogspot.com/feeds/2376368188321931074/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2012/08/cscan-finding-gene-expression.html#comment-form" title="2 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/2376368188321931074?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/2376368188321931074?v=2" /><link rel="alternate" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2012/08/cscan-finding-gene-expression.html" title="Cscan: Finding Gene Expression Regulators with ENCODE ChIP-Seq Data" /><author><name>Stephen Turner</name><uri>http://www.blogger.com/profile/06656711316726116187</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="26" height="32" src="http://3.bp.blogspot.com/-aT3qBWI4VYc/TgvR9CnlS0I/AAAAAAAAMDk/KuA2GGqURcc/s220/pic2-cropped-400x500.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://1.bp.blogspot.com/-AQp5FJb981g/UBkjjvmBKWI/AAAAAAAA7bI/KZC45xJrBNY/s72-c/cscan.png" height="72" width="72" /><thr:total>2</thr:total></entry><entry 
gd:etag="W/&quot;D0MMQXk7eyp7ImA9WhJRFUs.&quot;"><id>tag:blogger.com,1999:blog-6232819486261696035.post-4572915542945236858</id><published>2012-07-17T17:51:00.000-05:00</published><updated>2012-07-17T17:51:20.703-05:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-07-17T17:51:20.703-05:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="ggplot2" /><category scheme="http://www.blogger.com/atom/ns#" term="R" /><category scheme="http://www.blogger.com/atom/ns#" term="Bioinformatics" /><title>Plotting the Frequency of Twitter Hashtag Usage Over Time with R and ggplot2</title><content type="html">The 20th annual &lt;a href="http://www.iscb.org/ismb2012/" target="_blank"&gt;ISMB&lt;/a&gt; meeting was held over the last week in Long Beach, CA. It was an incredible meeting with lots of interesting and relevant talks, and lots of folks were tweeting the conference, usually with at least a few people in each concurrent session. I wrote &lt;a href="https://gist.github.com/3132596" target="_blank"&gt;the code&lt;/a&gt; below that uses the &lt;a href="http://cran.r-project.org/web/packages/twitteR/" target="_blank"&gt;twitteR package&lt;/a&gt; to pull all the tweets about the meeting under the &lt;a href="https://twitter.com/#!/search/realtime/%23ISMB" target="_blank"&gt;#ISMB hashtag&lt;/a&gt;. You can &lt;a href="https://docs.google.com/spreadsheet/ccc?key=0AnpVR0cZ0J4idGpWdEtuTXdxdVgwNXAzcVA4RlU1c2c" target="_blank"&gt;download that raw data here&lt;/a&gt;. I then use&amp;nbsp;ggplot2 to plot the frequency of tweets about #ISMB over time in two hour windows for each day of the last week.&lt;br /&gt;
&lt;div&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://1.bp.blogspot.com/-YyK6ISQZ_tk/UAXn7WbQOXI/AAAAAAAA610/0blI7Tgzg3g/s1600/ismb-frequency.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" src="http://1.bp.blogspot.com/-YyK6ISQZ_tk/UAXn7WbQOXI/AAAAAAAA610/0blI7Tgzg3g/s1600/ismb-frequency.png" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
The code below also tabulates the total number of tweets by username, and plots the 40 most prolific. Interestingly, several of the folks in this list weren't even at the meeting.&lt;br /&gt;
&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://1.bp.blogspot.com/-1S61k_2GbzA/UAXoG9bW9aI/AAAAAAAA618/b1yjP1SaLgc/s1600/ismb-users.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" src="http://1.bp.blogspot.com/-1S61k_2GbzA/UAXoG9bW9aI/AAAAAAAA618/b1yjP1SaLgc/s1600/ismb-users.png" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
I'll update the plots above at the conclusion of the meeting.&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
Here's the code below. There's a limitation with this code - you can only retrieve a maximum of 1500 tweets per query without authenticating via OAuth before you receive a 403 error. The twitteR package had a good vignette about how to use the&lt;span style="background-color: white;"&gt;&amp;nbsp;&lt;/span&gt;&lt;a href="http://cran.r-project.org/web/packages/ROAuth/index.html" style="background-color: white;" target="_blank"&gt;ROAuth package&lt;/a&gt;&amp;nbsp;to do this, but I was never able to get it to work properly. The version on CRAN (0.9.1) has known issues, but even when rolling back to 0.9.0 or upgrading to 0.9.2 from the author's homepage, I still received the 403 signal. So my hackjob workaround was to write a loop to fetch all the tweets one day at a time and then flatten this into a single list before converting to a data frame. You still run into the limitation of only being able to retrieve the first 1500 for each day, but #ISMB never had more than 1500 in any one day. If you can solve my ROAuth problem, please leave a comment or fork the &lt;a href="https://gist.github.com/3132596" target="_blank"&gt;code on GitHub&lt;/a&gt;.&lt;br /&gt;
&lt;br /&gt;
&lt;script src="https://gist.github.com/3132596.js?file=ismb_twitter.R"&gt;
&lt;/script&gt;&lt;br /&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div&gt;
&lt;br /&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;Getting Genetics Done by Stephen Turner is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/GettingGeneticsDone/~4/-n_4wvy5xtI" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://gettinggeneticsdone.blogspot.com/feeds/4572915542945236858/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2012/07/plotting-frequency-of-twitter-hashtag.html#comment-form" title="5 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/4572915542945236858?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/4572915542945236858?v=2" /><link rel="alternate" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2012/07/plotting-frequency-of-twitter-hashtag.html" title="Plotting the Frequency of Twitter Hashtag Usage Over Time with R and ggplot2" /><author><name>Stephen Turner</name><uri>http://www.blogger.com/profile/06656711316726116187</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="26" height="32" src="http://3.bp.blogspot.com/-aT3qBWI4VYc/TgvR9CnlS0I/AAAAAAAAMDk/KuA2GGqURcc/s220/pic2-cropped-400x500.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://1.bp.blogspot.com/-YyK6ISQZ_tk/UAXn7WbQOXI/AAAAAAAA610/0blI7Tgzg3g/s72-c/ismb-frequency.png" height="72" width="72" /><thr:total>5</thr:total></entry><entry gd:etag="W/&quot;DEEDSXk8eip7ImA9WhJSFkg.&quot;"><id>tag:blogger.com,1999:blog-6232819486261696035.post-5290724219896931536</id><published>2012-07-06T10:38:00.001-05:00</published><updated>2012-07-07T05:24:38.772-05:00</updated><app:edited 
xmlns:app="http://www.w3.org/2007/app">2012-07-07T05:24:38.772-05:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="Visualization" /><category scheme="http://www.blogger.com/atom/ns#" term="R" /><title>Fix Overplotting with Colored Contour Lines</title><content type="html">I saw &lt;a href="http://i.imgur.com/6kVrT.png" target="_blank"&gt;this plot&lt;/a&gt;&amp;nbsp;in the supplement of a &lt;a href="http://www.biomedcentral.com/1755-8794/5/28/abstract" target="_blank"&gt;recent paper&lt;/a&gt; comparing microarray results to RNA-seq results. Nothing earth-shattering in the paper - you've probably seen a similar comparison many times before - but I liked how they solved the overplotting problem using heat-colored contour lines to indicate density. I &lt;a href="http://stats.stackexchange.com/questions/31726/scatterplot-with-contour-heat-overlay" target="_blank"&gt;asked how to reproduce this figure using R on Stack Exchange&lt;/a&gt;, and my question was quickly answered by&amp;nbsp;&lt;a href="https://twitter.com/chlalanne" target="_blank"&gt;Christophe Lalanne&lt;/a&gt;.&lt;br /&gt;
&lt;br /&gt;
Here's the R code to generate the data and all the figures here.&lt;br /&gt;
&lt;script src="https://gist.github.com/3060940.js?file=overplotting_fix.r"&gt;
&lt;/script&gt;&lt;br /&gt;
&lt;br /&gt;
Here's the problem: there are 50,000 points in this plot, causing extreme overplotting. (This is a simple multivariate normal distribution, but if the distribution were more complex, overplotting might obscure a relationship in the data that you didn't know about.)&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://3.bp.blogspot.com/-kdCcQkI5Cgw/T_cEQf00BlI/AAAAAAAA5NU/vmHMyeKREgw/s1600/orig.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" src="http://3.bp.blogspot.com/-kdCcQkI5Cgw/T_cEQf00BlI/AAAAAAAA5NU/vmHMyeKREgw/s1600/orig.png" /&gt;&lt;/a&gt;&lt;/div&gt;
I liked the solution they used in the paper referenced above. Contour lines were placed throughout the data indicating the density of the data in that region. Further, the contour lines were "heat" colored from blue to red, indicating increasing data density. Optionally, you can add vertical and horizontal lines that intersect the means, and a legend that includes the absolute correlation coefficient between the two variables.&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://2.bp.blogspot.com/-q-QHIDgNs0c/T_cE2-WB-xI/AAAAAAAA5Nc/lFcpaxrgQgk/s1600/contour.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" src="http://2.bp.blogspot.com/-q-QHIDgNs0c/T_cE2-WB-xI/AAAAAAAA5Nc/lFcpaxrgQgk/s1600/contour.png" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;span style="background-color: white;"&gt;There are many other ways to solve an overplotting problem - reducing the size of the points, making points transparent, using hex-binning.&amp;nbsp;&lt;/span&gt;&lt;br /&gt;
&lt;span style="background-color: white;"&gt;&lt;br /&gt;&lt;/span&gt;&lt;br /&gt;
&lt;span style="background-color: white;"&gt;Using a single pixel for each data point:&lt;/span&gt;&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://3.bp.blogspot.com/-74U4ffypjoM/T_cE_lJIZZI/AAAAAAAA5Nk/1dsgqt0LKEA/s1600/pixels.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" src="http://3.bp.blogspot.com/-74U4ffypjoM/T_cE_lJIZZI/AAAAAAAA5Nk/1dsgqt0LKEA/s1600/pixels.png" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;span style="background-color: white;"&gt;&lt;br /&gt;&lt;/span&gt;&lt;br /&gt;
&lt;span style="background-color: white;"&gt;&lt;br /&gt;&lt;/span&gt;&lt;br /&gt;
&lt;span style="background-color: white;"&gt;Using hexagonal binning to display density (hexbin package):&lt;/span&gt;&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://2.bp.blogspot.com/-B647kbwxscI/T_cFBsSDo_I/AAAAAAAA5Ns/sbI_1lsaLSc/s1600/hexbin.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" src="http://2.bp.blogspot.com/-B647kbwxscI/T_cFBsSDo_I/AAAAAAAA5Ns/sbI_1lsaLSc/s1600/hexbin.png" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;span style="background-color: white;"&gt;&lt;br /&gt;&lt;/span&gt;&lt;br /&gt;
&lt;span style="background-color: white;"&gt;Finally, using semi-transparency (10% opacity; easiest using the ggplot2 package):&lt;/span&gt;&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://2.bp.blogspot.com/-TqnLYbVStN4/T_cFC-KOpDI/AAAAAAAA5N0/L1sXaO6TOBg/s1600/semitransparent.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" src="http://2.bp.blogspot.com/-TqnLYbVStN4/T_cFC-KOpDI/AAAAAAAA5N0/L1sXaO6TOBg/s1600/semitransparent.png" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;span style="background-color: white;"&gt;&lt;br /&gt;&lt;/span&gt;&lt;br /&gt;
&lt;span style="background-color: white;"&gt;&lt;b&gt;Edit July 7, 2012&lt;/b&gt; - From Pete's comment below, the smoothScatter() function in the built-in graphics package&amp;nbsp;&lt;/span&gt;&lt;span style="background-color: white;"&gt;produces a smoothed color density representation of the scatterplot, obtained through a kernel density estimate. You can change the colors using the colramp option, and change how many outliers are plotted with the nrpoints option. Here, 100 outliers are plotted as single black pixels - outliers here being points in the areas of lowest regional density.&lt;/span&gt;&lt;br /&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://1.bp.blogspot.com/-Lg7DJPS1nZo/T_gOI-EoDAI/AAAAAAAA5OA/xsy-IY637pU/s1600/smoothScatter.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" src="http://1.bp.blogspot.com/-Lg7DJPS1nZo/T_gOI-EoDAI/AAAAAAAA5OA/xsy-IY637pU/s1600/smoothScatter.png" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;br /&gt;
&lt;span style="background-color: white;"&gt;&lt;br /&gt;&lt;/span&gt;&lt;br /&gt;
&lt;span style="background-color: white;"&gt;How do you deal with overplotting when you have many points?&lt;/span&gt;&lt;div class="blogger-post-footer"&gt;Getting Genetics Done by Stephen Turner is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/GettingGeneticsDone/~4/_ZgSWX21bvU" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://gettinggeneticsdone.blogspot.com/feeds/5290724219896931536/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2012/07/fix-overplotting-with-colored-contour.html#comment-form" title="8 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/5290724219896931536?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/5290724219896931536?v=2" /><link rel="alternate" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2012/07/fix-overplotting-with-colored-contour.html" title="Fix Overplotting with Colored Contour Lines" /><author><name>Stephen Turner</name><uri>http://www.blogger.com/profile/06656711316726116187</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="26" height="32" src="http://3.bp.blogspot.com/-aT3qBWI4VYc/TgvR9CnlS0I/AAAAAAAAMDk/KuA2GGqURcc/s220/pic2-cropped-400x500.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://3.bp.blogspot.com/-kdCcQkI5Cgw/T_cEQf00BlI/AAAAAAAA5NU/vmHMyeKREgw/s72-c/orig.png" height="72" width="72" /><thr:total>8</thr:total></entry><entry 
gd:etag="W/&quot;CEMNQ309eyp7ImA9WhJTGE8.&quot;"><id>tag:blogger.com,1999:blog-6232819486261696035.post-2553300851754406825</id><published>2012-06-27T13:39:00.000-05:00</published><updated>2012-06-27T13:41:32.363-05:00</updated><app:edited xmlns:app="http://www.w3.org/2007/app">2012-06-27T13:41:32.363-05:00</app:edited><category scheme="http://www.blogger.com/atom/ns#" term="dbGaP" /><category scheme="http://www.blogger.com/atom/ns#" term="Databases" /><category scheme="http://www.blogger.com/atom/ns#" term="Web Apps" /><category scheme="http://www.blogger.com/atom/ns#" term="GWAS" /><category scheme="http://www.blogger.com/atom/ns#" term="Bioinformatics" /><title>Browsing dbGAP Results</title><content type="html">&lt;br /&gt;
&lt;div class="MsoNormal"&gt;
Thanks to the excellent work of &lt;a href="http://www.genome.gov/27545692"&gt;Lucia Hindorff&lt;/a&gt; and colleagues
at NHGRI, the GWAS catalog provides a great reference for the cumulative
results of GWAS for various phenotypes.&amp;nbsp; Anyone
familiar with GWAS also likely knows about &lt;a href="http://www.ncbi.nlm.nih.gov/gap/"&gt;dbGaP&lt;/a&gt; – the NCBI repository for genotype-phenotype
relationships – and the wealth of data it contains.&amp;nbsp; While dbGaP is often thought of as a way to
get access to existing genotype data, analysis results are often deposited into
dbGaP as well.&amp;nbsp; Individual-level data
(like genotypes) are generally considered “controlled access”, requiring
special permission to retrieve or use.&amp;nbsp; Summary-level
data, such as association p-values, are a bit more accessible.&amp;nbsp; There are two tools available from the dbGaP
website: the &lt;a href="http://www.ncbi.nlm.nih.gov/projects/gapplusprev/sgap_plus.htm"&gt;Association Results Browser&lt;/a&gt; and the &lt;a href="http://www.ncbi.nlm.nih.gov/gap/PheGenI"&gt;Phenotype-Genotype Integrator (PheGenI)&lt;/a&gt;.&amp;nbsp; These tools provide a search
interface for examining previous GWAS associations.&amp;nbsp; &lt;o:p&gt;&lt;/o:p&gt;&lt;/div&gt;
&lt;div class="MsoNormal"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="MsoNormal"&gt;
The &lt;a href="http://www.ncbi.nlm.nih.gov/projects/gapplusprev/sgap_plus.htm"&gt;Association Results Browser&lt;/a&gt; provides a simple table
listing of associations, searchable by SNP, gene, or phenotype.&amp;nbsp; It contains the information from the &lt;a href="http://www.genome.gov/26525384"&gt;NHGRI GWAS catalog&lt;/a&gt;, as well as additional associations from dbGaP-deposited
studies.&amp;nbsp; I’ve shown an example below for
multiple sclerosis.&amp;nbsp; You can restrict the
search to the dbGaP-specific results by changing the “Source” selection.&amp;nbsp; If you are looking for the impact of a SNP,
this is a nice supplement to the catalog.&amp;nbsp;
Clicking on a p-value brings up the GaP browser, which provides a more
graphical (but perhaps less useful) view of the data.&lt;o:p&gt;&lt;/o:p&gt;&lt;/div&gt;
&lt;div class="MsoNormal"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;/div&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;/div&gt;
&lt;div class="separator" style="clear: both; text-align: center;"&gt;
&lt;a href="http://1.bp.blogspot.com/-kh9KwetbZ-A/T-tTKar92YI/AAAAAAAABP0/SNcozFl0NdU/s1600/assoc_browser.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="534" src="http://1.bp.blogspot.com/-kh9KwetbZ-A/T-tTKar92YI/AAAAAAAABP0/SNcozFl0NdU/s640/assoc_browser.png" width="640" /&gt;&lt;/a&gt;&lt;/div&gt;
&lt;div class="MsoNormal"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="MsoNormal"&gt;
&lt;br /&gt;&lt;/div&gt;
&lt;div class="MsoNormal"&gt;
The &lt;a href="http://www.ncbi.nlm.nih.gov/gap/PheGenI"&gt;PheGenI&lt;/a&gt; tool provides similar search functionality, but
attempts to provide phenotype categories rather than more specific phenotype
associations.&amp;nbsp; Essentially, phenotype
descriptions are linked to &lt;a href="http://www.nlm.nih.gov/mesh/"&gt;MeSH terms&lt;/a&gt; to provide categories such as “Chemicals
and Drugs”, or “Hemic and Lymphatic Diseases”.&amp;nbsp;
PheGenI seems most useful if searching from the phenotype perspective,
while the association browser seems better for SNP or Gene searches. &amp;nbsp;All these tools are under active development, and I look forward to seeing their future versions.&lt;o:p&gt;&lt;/o:p&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;Getting Genetics Done by Stephen Turner is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.&lt;/div&gt;&lt;img src="http://feeds.feedburner.com/~r/GettingGeneticsDone/~4/zgFOIZk-r1o" height="1" width="1"/&gt;</content><link rel="replies" type="application/atom+xml" href="http://gettinggeneticsdone.blogspot.com/feeds/2553300851754406825/comments/default" title="Post Comments" /><link rel="replies" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2012/06/browsing-dbgap-results.html#comment-form" title="1 Comments" /><link rel="edit" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/2553300851754406825?v=2" /><link rel="self" type="application/atom+xml" href="http://www.blogger.com/feeds/6232819486261696035/posts/default/2553300851754406825?v=2" /><link rel="alternate" type="text/html" href="http://gettinggeneticsdone.blogspot.com/2012/06/browsing-dbgap-results.html" title="Browsing dbGAP Results" /><author><name>Will</name><uri>http://www.blogger.com/profile/09703349044940180835</uri><email>noreply@blogger.com</email><gd:image rel="http://schemas.google.com/g/2005#thumbnail" width="32" height="32" src="http://1.bp.blogspot.com/-DPV6nrTtGHY/UPTSZniomRI/AAAAAAAABRc/_nDd2s1gwko/s220/19bf088.jpg" /></author><media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="http://1.bp.blogspot.com/-kh9KwetbZ-A/T-tTKar92YI/AAAAAAAABP0/SNcozFl0NdU/s72-c/assoc_browser.png" height="72" width="72" /><thr:total>1</thr:total></entry></feed>
