File: exploringdatasets.html

package info (click to toggle)
r-cran-sjmisc 2.8.10-1
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 1,232 kB
sloc: sh: 13; makefile: 2
file content (605 lines) | stat: -rw-r--r-- 34,210 bytes
<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="utf-8" />
<meta name="generator" content="pandoc" />
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />

<meta name="viewport" content="width=device-width, initial-scale=1" />

<meta name="author" content="Daniel Lüdecke" />

<meta name="date" content="2024-05-13" />

<title>Exploring Data Sets</title>

<script>// Pandoc 2.9 puts attributes on both the section div and its header. Strip them
// from the header so the result matches the behavior of Pandoc < 2.8.
document.addEventListener('DOMContentLoaded', function(e) {
  var candidates = document.querySelectorAll("div.section[class*='level'] > :first-child");
  var headerTag = /^h[1-6]$/i;
  for (var idx = 0; idx < candidates.length; idx++) {
    var node = candidates[idx];
    // only h1-h6 elements are touched; any other first child is left alone
    if (headerTag.test(node.tagName)) {
      // the attribute map shrinks as entries are removed, so keep dropping
      // the first one until nothing is left
      var attrs = node.attributes;
      while (attrs.length > 0) node.removeAttribute(attrs[0].name);
    }
  }
});
</script>

<style type="text/css">
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
span.underline{text-decoration: underline;}
div.column{display: inline-block; vertical-align: top; width: 50%;}
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
ul.task-list{list-style: none;}
</style>



<style type="text/css">
code {
white-space: pre;
}
.sourceCode {
overflow: visible;
}
</style>
<style type="text/css" data-origin="pandoc">
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { line-height: 1.25; }
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
pre > code.sourceCode { white-space: pre-wrap; }
pre > code.sourceCode > span { display: inline-block; text-indent: -5em; padding-left: 5em; }
}
pre.numberSource code
{ counter-reset: source-line 0; }
pre.numberSource code > span
{ position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
{ content: counter(source-line);
position: relative; left: -1em; text-align: right; vertical-align: baseline;
border: none; display: inline-block;
-webkit-touch-callout: none; -webkit-user-select: none;
-khtml-user-select: none; -moz-user-select: none;
-ms-user-select: none; user-select: none;
padding: 0 4px; width: 4em;
color: #aaaaaa;
}
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
div.sourceCode
{ }
@media screen {
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
code span.al { color: #ff0000; font-weight: bold; } 
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } 
code span.at { color: #7d9029; } 
code span.bn { color: #40a070; } 
code span.bu { color: #008000; } 
code span.cf { color: #007020; font-weight: bold; } 
code span.ch { color: #4070a0; } 
code span.cn { color: #880000; } 
code span.co { color: #60a0b0; font-style: italic; } 
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } 
code span.do { color: #ba2121; font-style: italic; } 
code span.dt { color: #902000; } 
code span.dv { color: #40a070; } 
code span.er { color: #ff0000; font-weight: bold; } 
code span.ex { } 
code span.fl { color: #40a070; } 
code span.fu { color: #06287e; } 
code span.im { color: #008000; font-weight: bold; } 
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } 
code span.kw { color: #007020; font-weight: bold; } 
code span.op { color: #666666; } 
code span.ot { color: #007020; } 
code span.pp { color: #bc7a00; } 
code span.sc { color: #4070a0; } 
code span.ss { color: #bb6688; } 
code span.st { color: #4070a0; } 
code span.va { color: #19177c; } 
code span.vs { color: #4070a0; } 
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } 
</style>
<script>
// Apply pandoc's div.sourceCode style to pre.sourceCode instead.
// Walks every stylesheet whose owner element carries data-origin="pandoc" and,
// for each `div.sourceCode` rule that sets a color or background-color,
// replaces it in place with an equivalent `pre.sourceCode` rule.
(function() {
  var sheets = document.styleSheets;
  for (var i = 0; i < sheets.length; i++) {
    // ownerNode is null for stylesheets pulled in via @import (and may lack a
    // dataset when it is not an element); skip those instead of throwing
    var owner = sheets[i].ownerNode;
    if (!owner || !owner.dataset || owner.dataset["origin"] !== "pandoc") continue;
    // reading cssRules can throw (e.g. for cross-origin sheets); ignore such sheets
    try { var rules = sheets[i].cssRules; } catch (e) { continue; }
    var j = 0;
    while (j < rules.length) {
      var rule = rules[j];
      // check if there is a div.sourceCode rule
      if (rule.type !== rule.STYLE_RULE || rule.selectorText !== "div.sourceCode") {
        j++;
        continue;
      }
      var style = rule.style.cssText;
      // check if color or background-color is set
      if (rule.style.color === '' && rule.style.backgroundColor === '') {
        j++;
        continue;
      }
      // replace div.sourceCode by a pre.sourceCode rule; j is not advanced here,
      // the next pass sees the new pre.sourceCode rule and steps over it
      sheets[i].deleteRule(j);
      sheets[i].insertRule('pre.sourceCode{' + style + '}', j);
    }
  }
})();
</script>




<style type="text/css">body {
background-color: #fff;
margin: 1em auto;
max-width: 700px;
overflow: visible;
padding-left: 2em;
padding-right: 2em;
font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
font-size: 14px;
line-height: 1.35;
}
#TOC {
clear: both;
margin: 0 0 10px 10px;
padding: 4px;
width: 400px;
border: 1px solid #CCCCCC;
border-radius: 5px;
background-color: #f6f6f6;
font-size: 13px;
line-height: 1.3;
}
#TOC .toctitle {
font-weight: bold;
font-size: 15px;
margin-left: 5px;
}
#TOC ul {
padding-left: 40px;
margin-left: -1.5em;
margin-top: 5px;
margin-bottom: 5px;
}
#TOC ul ul {
margin-left: -2em;
}
#TOC li {
line-height: 16px;
}
table {
margin: 1em auto;
border-width: 1px;
border-color: #DDDDDD;
border-style: outset;
border-collapse: collapse;
}
table th {
border-width: 2px;
padding: 5px;
border-style: inset;
}
table td {
border-width: 1px;
border-style: inset;
line-height: 18px;
padding: 5px 5px;
}
table, table th, table td {
border-left-style: none;
border-right-style: none;
}
table thead, table tr.even {
background-color: #f7f7f7;
}
p {
margin: 0.5em 0;
}
blockquote {
background-color: #f6f6f6;
padding: 0.25em 0.75em;
}
/* Horizontal rule: all borders cleared, then a single 1px line on top.
   (A `border-style` set before the `border` shorthand would be reset by it,
   so it is not declared.) */
hr {
border: none;
border-top: 1px solid #777;
margin: 28px 0;
}
dl {
margin-left: 0;
}
dl dd {
margin-bottom: 13px;
margin-left: 13px;
}
dl dt {
font-weight: bold;
}
ul {
margin-top: 0;
}
ul li {
list-style: circle outside;
}
ul ul {
margin-bottom: 0;
}
pre, code {
background-color: #f7f7f7;
border-radius: 3px;
color: #333;
white-space: pre-wrap; 
}
pre {
border-radius: 3px;
margin: 5px 0px 10px 0px;
padding: 10px;
}
pre:not([class]) {
background-color: #f7f7f7;
}
code {
font-family: Consolas, Monaco, 'Courier New', monospace;
font-size: 85%;
}
p > code, li > code {
padding: 2px 0px;
}
div.figure {
text-align: center;
}
/* Images: white backing, 2px padding, a thin rounded #CCCCCC frame and
   horizontal spacing. Only the border declaration that takes effect is kept. */
img {
background-color: #FFFFFF;
padding: 2px;
border: 1px solid #CCCCCC;
border-radius: 3px;
margin: 0 5px;
}
h1 {
margin-top: 0;
font-size: 35px;
line-height: 40px;
}
h2 {
border-bottom: 4px solid #f7f7f7;
padding-top: 10px;
padding-bottom: 2px;
font-size: 145%;
}
h3 {
border-bottom: 2px solid #f7f7f7;
padding-top: 10px;
font-size: 120%;
}
h4 {
border-bottom: 1px solid #f7f7f7;
margin-left: 8px;
font-size: 105%;
}
h5, h6 {
border-bottom: 1px solid #ccc;
font-size: 105%;
}
a {
color: #0033dd;
text-decoration: none;
}
a:hover {
color: #6666ff; }
a:visited {
color: #800080; }
a:visited:hover {
color: #BB00BB; }
a[href^="http:"] {
text-decoration: underline; }
a[href^="https:"] {
text-decoration: underline; }

code > span.kw { color: #555; font-weight: bold; } 
code > span.dt { color: #902000; } 
code > span.dv { color: #40a070; } 
code > span.bn { color: #d14; } 
code > span.fl { color: #d14; } 
code > span.ch { color: #d14; } 
code > span.st { color: #d14; } 
code > span.co { color: #888888; font-style: italic; } 
code > span.ot { color: #007020; } 
code > span.al { color: #ff0000; font-weight: bold; } 
code > span.fu { color: #900; font-weight: bold; } 
code > span.er { color: #a61717; background-color: #e3d2d2; } 
</style>




</head>

<body>




<h1 class="title toc-ignore">Exploring Data Sets</h1>
<h4 class="author">Daniel Lüdecke</h4>
<h4 class="date">2024-05-13</h4>



<p>Tidying up, transforming and exploring data is an important part of
data analysis, and you can manage many common tasks in this process with
the <em>tidyverse</em> or related packages. The
<strong>sjmisc</strong>-package fits into this workflow, especially when
you work with <a href="https://cran.r-project.org/package=sjlabelled">labelled data</a>,
because it offers functions for data transformation and labelled data
utility functions. This vignette describes typical steps when beginning
with data exploration.</p>
<p>The examples are based on data from the EUROFAMCARE project, a survey
on the situation of family carers of older people in Europe. The sample
data set <code>efc</code> is part of this package. Let us see how the
family carer’s gender and subjective perception of negative impact of
care as well as the cared-for person’s dependency are associated with
the family carer’s quality of life.</p>
<div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" tabindex="-1"></a><span class="fu">library</span>(sjmisc)</span>
<span id="cb1-2"><a href="#cb1-2" tabindex="-1"></a><span class="fu">library</span>(dplyr)</span>
<span id="cb1-3"><a href="#cb1-3" tabindex="-1"></a><span class="fu">data</span>(efc)</span></code></pre></div>
<div id="print-frequencies-with-labels" class="section level2">
<h2>Print frequencies with labels</h2>
<p>The first thing that may be of interest is probably the distribution
of gender. You can print frequencies for labelled data with
<code>frq()</code>. This function requires either a vector or data frame
as input and prints the variable label as first line, followed by a
frequency-table with values, labels, counts and percentages of the
vector.</p>
<div class="sourceCode" id="cb2"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb2-1"><a href="#cb2-1" tabindex="-1"></a><span class="fu">frq</span>(efc<span class="sc">$</span>c161sex)</span>
<span id="cb2-2"><a href="#cb2-2" tabindex="-1"></a><span class="co">#&gt; carer&#39;s gender (x) &lt;numeric&gt; </span></span>
<span id="cb2-3"><a href="#cb2-3" tabindex="-1"></a><span class="co">#&gt; # total N=908 valid N=901 mean=1.76 sd=0.43</span></span>
<span id="cb2-4"><a href="#cb2-4" tabindex="-1"></a><span class="co">#&gt; </span></span>
<span id="cb2-5"><a href="#cb2-5" tabindex="-1"></a><span class="co">#&gt; Value |  Label |   N | Raw % | Valid % | Cum. %</span></span>
<span id="cb2-6"><a href="#cb2-6" tabindex="-1"></a><span class="co">#&gt; -----------------------------------------------</span></span>
<span id="cb2-7"><a href="#cb2-7" tabindex="-1"></a><span class="co">#&gt;     1 |   Male | 215 | 23.68 |   23.86 |  23.86</span></span>
<span id="cb2-8"><a href="#cb2-8" tabindex="-1"></a><span class="co">#&gt;     2 | Female | 686 | 75.55 |   76.14 | 100.00</span></span>
<span id="cb2-9"><a href="#cb2-9" tabindex="-1"></a><span class="co">#&gt;  &lt;NA&gt; |   &lt;NA&gt; |   7 |  0.77 |    &lt;NA&gt; |   &lt;NA&gt;</span></span></code></pre></div>
</div>
<div id="find-variables-in-a-data-frame" class="section level2">
<h2>Find variables in a data frame</h2>
<p>Next, let’s look at the distribution of gender by the cared-for
person’s dependency. To compute cross tables, you can use
<code>flat_table()</code>. It requires the data as first argument,
followed by any number of variable names.</p>
<p>But first, we need to know the name of the dependency-variable. This
is where <code>find_var()</code> comes into play. It searches for
variables in a data frame by</p>
<ol style="list-style-type: decimal">
<li>variable names,</li>
<li>variable labels,</li>
<li>value labels</li>
<li>or any combination of these.</li>
</ol>
<p>By default, it looks for variable names and labels. The function also
supports regex-patterns. By default, <code>find_var()</code> returns the
column-indices, but you can also print a small “summary” with the
<code>out</code>-argument.</p>
<div class="sourceCode" id="cb3"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1" tabindex="-1"></a><span class="co"># find all variables with &quot;dependency&quot; in name or label</span></span>
<span id="cb3-2"><a href="#cb3-2" tabindex="-1"></a><span class="fu">find_var</span>(efc, <span class="st">&quot;dependency&quot;</span>, <span class="at">out =</span> <span class="st">&quot;table&quot;</span>)</span>
<span id="cb3-3"><a href="#cb3-3" tabindex="-1"></a><span class="co">#&gt;   col.nr var.name          var.label</span></span>
<span id="cb3-4"><a href="#cb3-4" tabindex="-1"></a><span class="co">#&gt; 1      5   e42dep elder&#39;s dependency</span></span></code></pre></div>
<p>Variable in column 5, named <em>e42dep</em>, is what we are looking
for.</p>
</div>
<div id="print-crosstables-with-labels" class="section level2">
<h2>Print crosstables with labels</h2>
<p>Now we can look at the distribution of gender by dependency:</p>
<div class="sourceCode" id="cb4"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb4-1"><a href="#cb4-1" tabindex="-1"></a><span class="fu">flat_table</span>(efc, e42dep, c161sex)</span>
<span id="cb4-2"><a href="#cb4-2" tabindex="-1"></a><span class="co">#&gt;                      c161sex Male Female</span></span>
<span id="cb4-3"><a href="#cb4-3" tabindex="-1"></a><span class="co">#&gt; e42dep                                  </span></span>
<span id="cb4-4"><a href="#cb4-4" tabindex="-1"></a><span class="co">#&gt; independent                    18     48</span></span>
<span id="cb4-5"><a href="#cb4-5" tabindex="-1"></a><span class="co">#&gt; slightly dependent             54    170</span></span>
<span id="cb4-6"><a href="#cb4-6" tabindex="-1"></a><span class="co">#&gt; moderately dependent           80    226</span></span>
<span id="cb4-7"><a href="#cb4-7" tabindex="-1"></a><span class="co">#&gt; severely dependent             63    241</span></span></code></pre></div>
<p>Since the distribution of male and female carers is skewed, let’s see
the proportions. To compute crosstables with row or column percentages,
use the <code>margin</code>-argument:</p>
<div class="sourceCode" id="cb5"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1" tabindex="-1"></a><span class="fu">flat_table</span>(efc, e42dep, c161sex, <span class="at">margin =</span> <span class="st">&quot;col&quot;</span>)</span>
<span id="cb5-2"><a href="#cb5-2" tabindex="-1"></a><span class="co">#&gt;                      c161sex  Male Female</span></span>
<span id="cb5-3"><a href="#cb5-3" tabindex="-1"></a><span class="co">#&gt; e42dep                                   </span></span>
<span id="cb5-4"><a href="#cb5-4" tabindex="-1"></a><span class="co">#&gt; independent                   8.37   7.01</span></span>
<span id="cb5-5"><a href="#cb5-5" tabindex="-1"></a><span class="co">#&gt; slightly dependent           25.12  24.82</span></span>
<span id="cb5-6"><a href="#cb5-6" tabindex="-1"></a><span class="co">#&gt; moderately dependent         37.21  32.99</span></span>
<span id="cb5-7"><a href="#cb5-7" tabindex="-1"></a><span class="co">#&gt; severely dependent           29.30  35.18</span></span></code></pre></div>
</div>
<div id="recoding-variables" class="section level2">
<h2>Recoding variables</h2>
<p>Next, we need the negative impact of care (<em>neg_c_7</em>) and want
to create three groups: low, middle and high negative impact. We can
easily recode and label vectors with <code>rec()</code>. This function
does not only recode vectors, it also allows direct labelling of
categories inside the recode-syntax (this is optional, you can also use
the <code>val.labels</code>-argument). We now recode <em>neg_c_7</em>
into a new variable <em>burden</em>. The cut-points are a bit arbitrary,
for the sake of demonstration.</p>
<div class="sourceCode" id="cb6"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb6-1"><a href="#cb6-1" tabindex="-1"></a>efc<span class="sc">$</span>burden <span class="ot">&lt;-</span> <span class="fu">rec</span>(</span>
<span id="cb6-2"><a href="#cb6-2" tabindex="-1"></a>  efc<span class="sc">$</span>neg_c_7,</span>
<span id="cb6-3"><a href="#cb6-3" tabindex="-1"></a>  <span class="at">rec =</span> <span class="fu">c</span>(<span class="st">&quot;min:9=1 [low]; 10:12=2 [moderate]; 13:max=3 [high]; else=NA&quot;</span>),</span>
<span id="cb6-4"><a href="#cb6-4" tabindex="-1"></a>  <span class="at">var.label =</span> <span class="st">&quot;Subjective burden&quot;</span>,</span>
<span id="cb6-5"><a href="#cb6-5" tabindex="-1"></a>  <span class="at">as.num =</span> <span class="cn">FALSE</span> <span class="co"># we want a factor</span></span>
<span id="cb6-6"><a href="#cb6-6" tabindex="-1"></a>)</span>
<span id="cb6-7"><a href="#cb6-7" tabindex="-1"></a><span class="co"># print frequencies</span></span>
<span id="cb6-8"><a href="#cb6-8" tabindex="-1"></a><span class="fu">frq</span>(efc<span class="sc">$</span>burden)</span>
<span id="cb6-9"><a href="#cb6-9" tabindex="-1"></a><span class="co">#&gt; Subjective burden (x) &lt;categorical&gt; </span></span>
<span id="cb6-10"><a href="#cb6-10" tabindex="-1"></a><span class="co">#&gt; # total N=908 valid N=892 mean=2.03 sd=0.81</span></span>
<span id="cb6-11"><a href="#cb6-11" tabindex="-1"></a><span class="co">#&gt; </span></span>
<span id="cb6-12"><a href="#cb6-12" tabindex="-1"></a><span class="co">#&gt; Value |    Label |   N | Raw % | Valid % | Cum. %</span></span>
<span id="cb6-13"><a href="#cb6-13" tabindex="-1"></a><span class="co">#&gt; -------------------------------------------------</span></span>
<span id="cb6-14"><a href="#cb6-14" tabindex="-1"></a><span class="co">#&gt;     1 |      low | 280 | 30.84 |   31.39 |  31.39</span></span>
<span id="cb6-15"><a href="#cb6-15" tabindex="-1"></a><span class="co">#&gt;     2 | moderate | 301 | 33.15 |   33.74 |  65.13</span></span>
<span id="cb6-16"><a href="#cb6-16" tabindex="-1"></a><span class="co">#&gt;     3 |     high | 311 | 34.25 |   34.87 | 100.00</span></span>
<span id="cb6-17"><a href="#cb6-17" tabindex="-1"></a><span class="co">#&gt;  &lt;NA&gt; |     &lt;NA&gt; |  16 |  1.76 |    &lt;NA&gt; |   &lt;NA&gt;</span></span></code></pre></div>
<p>You can see the variable <em>burden</em> has a variable label
(“Subjective burden”), which was set inside <code>rec()</code>, as well
as three values with labels (“low”, “moderate” and “high”). Values from
the lowest value in <em>neg_c_7</em> to 9 were recoded into 1, values 10
to 12 into 2 and values 13 to the highest value in <em>neg_c_7</em> into 3.
All remaining values are set to missing (<code>else=NA</code> – for
details on the recode-syntax, see <code>?rec</code>).</p>
</div>
<div id="grouped-data-frames" class="section level2">
<h2>Grouped data frames</h2>
<p>How is burden distributed by gender? We can group the data and print
frequencies using <code>frq()</code> for this as well, as this function
also accepts grouped data frames. Frequencies for grouped data frames
first print the group-details (variable name and category), followed by
the frequency table. Thanks to labelled data, the output is easy to
understand.</p>
<div class="sourceCode" id="cb7"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1" tabindex="-1"></a>efc <span class="sc">%&gt;%</span> </span>
<span id="cb7-2"><a href="#cb7-2" tabindex="-1"></a>  <span class="fu">select</span>(burden, c161sex) <span class="sc">%&gt;%</span> </span>
<span id="cb7-3"><a href="#cb7-3" tabindex="-1"></a>  <span class="fu">group_by</span>(c161sex) <span class="sc">%&gt;%</span> </span>
<span id="cb7-4"><a href="#cb7-4" tabindex="-1"></a>  <span class="fu">frq</span>()</span>
<span id="cb7-5"><a href="#cb7-5" tabindex="-1"></a><span class="co">#&gt; Subjective burden (burden) &lt;categorical&gt; </span></span>
<span id="cb7-6"><a href="#cb7-6" tabindex="-1"></a><span class="co">#&gt; # grouped by: Male</span></span>
<span id="cb7-7"><a href="#cb7-7" tabindex="-1"></a><span class="co">#&gt; # total N=215 valid N=212 mean=1.91 sd=0.81</span></span>
<span id="cb7-8"><a href="#cb7-8" tabindex="-1"></a><span class="co">#&gt; </span></span>
<span id="cb7-9"><a href="#cb7-9" tabindex="-1"></a><span class="co">#&gt; Value |    Label |  N | Raw % | Valid % | Cum. %</span></span>
<span id="cb7-10"><a href="#cb7-10" tabindex="-1"></a><span class="co">#&gt; ------------------------------------------------</span></span>
<span id="cb7-11"><a href="#cb7-11" tabindex="-1"></a><span class="co">#&gt;     1 |      low | 80 | 37.21 |   37.74 |  37.74</span></span>
<span id="cb7-12"><a href="#cb7-12" tabindex="-1"></a><span class="co">#&gt;     2 | moderate | 72 | 33.49 |   33.96 |  71.70</span></span>
<span id="cb7-13"><a href="#cb7-13" tabindex="-1"></a><span class="co">#&gt;     3 |     high | 60 | 27.91 |   28.30 | 100.00</span></span>
<span id="cb7-14"><a href="#cb7-14" tabindex="-1"></a><span class="co">#&gt;  &lt;NA&gt; |     &lt;NA&gt; |  3 |  1.40 |    &lt;NA&gt; |   &lt;NA&gt;</span></span>
<span id="cb7-15"><a href="#cb7-15" tabindex="-1"></a><span class="co">#&gt; </span></span>
<span id="cb7-16"><a href="#cb7-16" tabindex="-1"></a><span class="co">#&gt; Subjective burden (burden) &lt;categorical&gt; </span></span>
<span id="cb7-17"><a href="#cb7-17" tabindex="-1"></a><span class="co">#&gt; # grouped by: Female</span></span>
<span id="cb7-18"><a href="#cb7-18" tabindex="-1"></a><span class="co">#&gt; # total N=686 valid N=679 mean=2.08 sd=0.81</span></span>
<span id="cb7-19"><a href="#cb7-19" tabindex="-1"></a><span class="co">#&gt; </span></span>
<span id="cb7-20"><a href="#cb7-20" tabindex="-1"></a><span class="co">#&gt; Value |    Label |   N | Raw % | Valid % | Cum. %</span></span>
<span id="cb7-21"><a href="#cb7-21" tabindex="-1"></a><span class="co">#&gt; -------------------------------------------------</span></span>
<span id="cb7-22"><a href="#cb7-22" tabindex="-1"></a><span class="co">#&gt;     1 |      low | 199 | 29.01 |   29.31 |  29.31</span></span>
<span id="cb7-23"><a href="#cb7-23" tabindex="-1"></a><span class="co">#&gt;     2 | moderate | 229 | 33.38 |   33.73 |  63.03</span></span>
<span id="cb7-24"><a href="#cb7-24" tabindex="-1"></a><span class="co">#&gt;     3 |     high | 251 | 36.59 |   36.97 | 100.00</span></span>
<span id="cb7-25"><a href="#cb7-25" tabindex="-1"></a><span class="co">#&gt;  &lt;NA&gt; |     &lt;NA&gt; |   7 |  1.02 |    &lt;NA&gt; |   &lt;NA&gt;</span></span></code></pre></div>
</div>
<div id="nested-data-frames" class="section level2">
<h2>Nested data frames</h2>
<p>Let’s investigate the association between quality of life and burden
across the different dependency categories, by fitting linear models for
each category of <em>e42dep</em>. We can do this using <em>nested data
frames</em>. <code>nest()</code> from the <strong>tidyr</strong>-package
can create subsets of a data frame, based on grouping criteria, and
create a new <em>list-variable</em>, where each element itself is a data
frame (so it’s nested, because we have data frames inside a data
frame).</p>
<p>In the following example, we group the data by <em>e42dep</em>, and
“nest” the groups. Now we get a data frame with two columns: First, the
grouping variable (<em>e42dep</em>) and second, the datasets (subsets)
for each dependency category as a data frame, stored in the list-variable
<em>data</em>. The data frames in the subsets (in <em>data</em>) all
contain the selected variables <em>burden</em>, <em>c161sex</em> and
<em>quol_5</em> (quality of life).</p>
<div class="sourceCode" id="cb8"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb8-1"><a href="#cb8-1" tabindex="-1"></a><span class="co"># convert variable to labelled factor, because we then </span></span>
<span id="cb8-2"><a href="#cb8-2" tabindex="-1"></a><span class="co"># have the labels as factor levels in the output</span></span>
<span id="cb8-3"><a href="#cb8-3" tabindex="-1"></a>efc<span class="sc">$</span>e42dep <span class="ot">&lt;-</span> <span class="fu">to_label</span>(efc<span class="sc">$</span>e42dep, <span class="at">drop.levels =</span> <span class="cn">TRUE</span>)</span>
<span id="cb8-4"><a href="#cb8-4" tabindex="-1"></a>efc <span class="sc">%&gt;%</span></span>
<span id="cb8-5"><a href="#cb8-5" tabindex="-1"></a>  <span class="fu">select</span>(e42dep, burden, c161sex, quol_5) <span class="sc">%&gt;%</span></span>
<span id="cb8-6"><a href="#cb8-6" tabindex="-1"></a>  <span class="fu">group_by</span>(e42dep) <span class="sc">%&gt;%</span></span>
<span id="cb8-7"><a href="#cb8-7" tabindex="-1"></a>  tidyr<span class="sc">::</span><span class="fu">nest</span>()</span>
<span id="cb8-8"><a href="#cb8-8" tabindex="-1"></a><span class="co">#&gt; # A tibble: 5 × 2</span></span>
<span id="cb8-9"><a href="#cb8-9" tabindex="-1"></a><span class="co">#&gt; # Groups:   e42dep [5]</span></span>
<span id="cb8-10"><a href="#cb8-10" tabindex="-1"></a><span class="co">#&gt;   e42dep               data              </span></span>
<span id="cb8-11"><a href="#cb8-11" tabindex="-1"></a><span class="co">#&gt;   &lt;fct&gt;                &lt;list&gt;            </span></span>
<span id="cb8-12"><a href="#cb8-12" tabindex="-1"></a><span class="co">#&gt; 1 moderately dependent &lt;tibble [306 × 3]&gt;</span></span>
<span id="cb8-13"><a href="#cb8-13" tabindex="-1"></a><span class="co">#&gt; 2 severely dependent   &lt;tibble [304 × 3]&gt;</span></span>
<span id="cb8-14"><a href="#cb8-14" tabindex="-1"></a><span class="co">#&gt; 3 independent          &lt;tibble [66 × 3]&gt; </span></span>
<span id="cb8-15"><a href="#cb8-15" tabindex="-1"></a><span class="co">#&gt; 4 slightly dependent   &lt;tibble [225 × 3]&gt;</span></span>
<span id="cb8-16"><a href="#cb8-16" tabindex="-1"></a><span class="co">#&gt; 5 &lt;NA&gt;                 &lt;tibble [7 × 3]&gt;</span></span></code></pre></div>
</div>
<div id="get-coefficients-of-nested-models" class="section level2">
<h2>Get coefficients of nested models</h2>
<p>Using <code>map()</code> from the <strong>purrr</strong>-package, we
can iterate over this list and apply any function to each data frame in the
list-variable “data”. We want to apply the <code>lm()</code>-function to
the list-variable, to run linear models for all “dependency-datasets”.
The results of these linear regressions are stored in another
list-variable, <em>models</em> (created with <code>mutate()</code>). To
quickly access and look at the coefficients, we can use
<code>spread_coef()</code>.</p>
<div class="sourceCode" id="cb9"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb9-1"><a href="#cb9-1" tabindex="-1"></a>efc <span class="sc">%&gt;%</span></span>
<span id="cb9-2"><a href="#cb9-2" tabindex="-1"></a>  <span class="fu">select</span>(e42dep, burden, c161sex, quol_5) <span class="sc">%&gt;%</span></span>
<span id="cb9-3"><a href="#cb9-3" tabindex="-1"></a>  <span class="fu">group_by</span>(e42dep) <span class="sc">%&gt;%</span></span>
<span id="cb9-4"><a href="#cb9-4" tabindex="-1"></a>  tidyr<span class="sc">::</span><span class="fu">nest</span>() <span class="sc">%&gt;%</span> </span>
<span id="cb9-5"><a href="#cb9-5" tabindex="-1"></a>  <span class="fu">na.omit</span>() <span class="sc">%&gt;%</span>       <span class="co"># remove nested group for NA</span></span>
<span id="cb9-6"><a href="#cb9-6" tabindex="-1"></a>  <span class="fu">arrange</span>(e42dep) <span class="sc">%&gt;%</span> <span class="co"># arrange by order of levels</span></span>
<span id="cb9-7"><a href="#cb9-7" tabindex="-1"></a>  <span class="fu">mutate</span>(<span class="at">models =</span> purrr<span class="sc">::</span><span class="fu">map</span>(</span>
<span id="cb9-8"><a href="#cb9-8" tabindex="-1"></a>    data, <span class="sc">~</span> </span>
<span id="cb9-9"><a href="#cb9-9" tabindex="-1"></a>    <span class="fu">lm</span>(quol_5 <span class="sc">~</span> burden <span class="sc">+</span> c161sex, <span class="at">data =</span> .))</span>
<span id="cb9-10"><a href="#cb9-10" tabindex="-1"></a>  ) <span class="sc">%&gt;%</span></span>
<span id="cb9-11"><a href="#cb9-11" tabindex="-1"></a>  <span class="fu">spread_coef</span>(models)</span>
<span id="cb9-12"><a href="#cb9-12" tabindex="-1"></a><span class="co">#&gt; # A tibble: 4 × 7</span></span>
<span id="cb9-13"><a href="#cb9-13" tabindex="-1"></a><span class="co">#&gt; # Groups:   e42dep [4]</span></span>
<span id="cb9-14"><a href="#cb9-14" tabindex="-1"></a><span class="co">#&gt;   e42dep               data     models `(Intercept)` burden2 burden3 c161sex</span></span>
<span id="cb9-15"><a href="#cb9-15" tabindex="-1"></a><span class="co">#&gt;   &lt;fct&gt;                &lt;list&gt;   &lt;list&gt;         &lt;dbl&gt;   &lt;dbl&gt;   &lt;dbl&gt;   &lt;dbl&gt;</span></span>
<span id="cb9-16"><a href="#cb9-16" tabindex="-1"></a><span class="co">#&gt; 1 independent          &lt;tibble&gt; &lt;lm&gt;            18.8   -3.16   -4.94  -0.709</span></span>
<span id="cb9-17"><a href="#cb9-17" tabindex="-1"></a><span class="co">#&gt; 2 slightly dependent   &lt;tibble&gt; &lt;lm&gt;            19.8   -2.20   -2.48  -1.14 </span></span>
<span id="cb9-18"><a href="#cb9-18" tabindex="-1"></a><span class="co">#&gt; 3 moderately dependent &lt;tibble&gt; &lt;lm&gt;            17.9   -1.82   -5.29  -0.637</span></span>
<span id="cb9-19"><a href="#cb9-19" tabindex="-1"></a><span class="co">#&gt; 4 severely dependent   &lt;tibble&gt; &lt;lm&gt;            19.1   -3.66   -7.92  -0.746</span></span></code></pre></div>
<p>We see that higher burden is associated with lower quality of life,
for all dependency-groups. The <code>se</code> and
<code>p.val</code>-arguments add standard errors and p-values to the
output. <code>model.term</code> returns the statistics only for a
specific term. If you specify a <code>model.term</code>, arguments
<code>se</code> and <code>p.val</code> automatically default to
<code>TRUE</code>.</p>
<div class="sourceCode" id="cb10"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb10-1"><a href="#cb10-1" tabindex="-1"></a>efc <span class="sc">%&gt;%</span></span>
<span id="cb10-2"><a href="#cb10-2" tabindex="-1"></a>  <span class="fu">select</span>(e42dep, burden, c161sex, quol_5) <span class="sc">%&gt;%</span></span>
<span id="cb10-3"><a href="#cb10-3" tabindex="-1"></a>  <span class="fu">group_by</span>(e42dep) <span class="sc">%&gt;%</span></span>
<span id="cb10-4"><a href="#cb10-4" tabindex="-1"></a>  tidyr<span class="sc">::</span><span class="fu">nest</span>() <span class="sc">%&gt;%</span> </span>
<span id="cb10-5"><a href="#cb10-5" tabindex="-1"></a>  <span class="fu">na.omit</span>() <span class="sc">%&gt;%</span>       <span class="co"># remove nested group for NA</span></span>
<span id="cb10-6"><a href="#cb10-6" tabindex="-1"></a>  <span class="fu">arrange</span>(e42dep) <span class="sc">%&gt;%</span> <span class="co"># arrange by order of levels</span></span>
<span id="cb10-7"><a href="#cb10-7" tabindex="-1"></a>  <span class="fu">mutate</span>(<span class="at">models =</span> purrr<span class="sc">::</span><span class="fu">map</span>(</span>
<span id="cb10-8"><a href="#cb10-8" tabindex="-1"></a>    data, <span class="sc">~</span> </span>
<span id="cb10-9"><a href="#cb10-9" tabindex="-1"></a>    <span class="fu">lm</span>(quol_5 <span class="sc">~</span> burden <span class="sc">+</span> c161sex, <span class="at">data =</span> .))</span>
<span id="cb10-10"><a href="#cb10-10" tabindex="-1"></a>  ) <span class="sc">%&gt;%</span></span>
<span id="cb10-11"><a href="#cb10-11" tabindex="-1"></a>  <span class="fu">spread_coef</span>(models, burden3)</span>
<span id="cb10-12"><a href="#cb10-12" tabindex="-1"></a><span class="co">#&gt; # A tibble: 4 × 6</span></span>
<span id="cb10-13"><a href="#cb10-13" tabindex="-1"></a><span class="co">#&gt; # Groups:   e42dep [4]</span></span>
<span id="cb10-14"><a href="#cb10-14" tabindex="-1"></a><span class="co">#&gt;   e42dep               data               models burden3 std.error  p.value</span></span>
<span id="cb10-15"><a href="#cb10-15" tabindex="-1"></a><span class="co">#&gt;   &lt;fct&gt;                &lt;list&gt;             &lt;list&gt;   &lt;dbl&gt;     &lt;dbl&gt;    &lt;dbl&gt;</span></span>
<span id="cb10-16"><a href="#cb10-16" tabindex="-1"></a><span class="co">#&gt; 1 independent          &lt;tibble [66 × 3]&gt;  &lt;lm&gt;     -4.94     2.20  2.84e- 2</span></span>
<span id="cb10-17"><a href="#cb10-17" tabindex="-1"></a><span class="co">#&gt; 2 slightly dependent   &lt;tibble [225 × 3]&gt; &lt;lm&gt;     -2.48     0.694 4.25e- 4</span></span>
<span id="cb10-18"><a href="#cb10-18" tabindex="-1"></a><span class="co">#&gt; 3 moderately dependent &lt;tibble [306 × 3]&gt; &lt;lm&gt;     -5.29     0.669 5.22e-14</span></span>
<span id="cb10-19"><a href="#cb10-19" tabindex="-1"></a><span class="co">#&gt; 4 severely dependent   &lt;tibble [304 × 3]&gt; &lt;lm&gt;     -7.92     0.875 2.10e-17</span></span></code></pre></div>
</div>



<!-- code folding -->


<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
  (function () {
    // create a <script> element pointing at the hosted MathJax build ...
    var loader = document.createElement("script");
    loader.type = "text/javascript";
    loader.src  = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
    // ... and attach it to the document's first <head> element
    var heads = document.getElementsByTagName("head");
    heads[0].appendChild(loader);
  })();
</script>

</body>
</html>