File: fix_random_seeds.patch

package info (click to toggle)
pandas 2.3.3%2Bdfsg-1
links: PTS, VCS
area: main
in suites: forky, sid
size: 66,992 kB
sloc: python: 425,583; ansic: 9,219; sh: 264; xml: 102; makefile: 85
file content (209 lines) | stat: -rw-r--r-- 6,815 bytes
Description: Use fixed seeds for reproducible pseudorandomness

Author: Rebecca N. Palmer <rebecca_palmer@zoho.com>
Forwarded: no

--- a/doc/source/getting_started/comparison/comparison_with_r.rst
+++ b/doc/source/getting_started/comparison/comparison_with_r.rst
@@ -237,6 +237,7 @@ In pandas we may use :meth:`~pandas.pivo
 
    import random
    import string
+   random.seed(123456) # for reproducibility
 
    baseball = pd.DataFrame(
        {
--- a/doc/source/user_guide/10min.rst
+++ b/doc/source/user_guide/10min.rst
@@ -701,6 +701,11 @@ We use the standard convention for refer
 The ``plt.close`` method is used to `close <https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.close.html>`__ a figure window:
 
 .. ipython:: python
+   :suppress:
+
+   np.random.seed(123456)  # for reproducibility
+
+.. ipython:: python
 
    ts = pd.Series(np.random.randn(1000), index=pd.date_range("1/1/2000", periods=1000))
    ts = ts.cumsum()
@@ -717,6 +722,11 @@ The ``plt.close`` method is used to `clo
 :meth:`~DataFrame.plot` plots all columns:
 
 .. ipython:: python
+   :suppress:
+
+   np.random.seed(123456)  # for reproducibility
+
+.. ipython:: python
 
    df = pd.DataFrame(
        np.random.randn(1000, 4), index=ts.index, columns=["A", "B", "C", "D"]
--- a/doc/source/user_guide/advanced.rst
+++ b/doc/source/user_guide/advanced.rst
@@ -590,6 +590,7 @@ they need to be sorted. As with any inde
 
    import random
 
+   random.seed(123456) # for reproducibility
    random.shuffle(tuples)
    s = pd.Series(np.random.randn(8), index=pd.MultiIndex.from_tuples(tuples))
    s
--- a/doc/source/user_guide/visualization.rst
+++ b/doc/source/user_guide/visualization.rst
@@ -1090,6 +1090,7 @@ are what constitutes the bootstrap plot.
 .. ipython:: python
 
    from pandas.plotting import bootstrap_plot
+   random.seed(123456) # for reproducibility - bootstrap_plot uses random.sample
 
    data = pd.Series(np.random.rand(1000))
 
--- a/pandas/plotting/_core.py
+++ b/pandas/plotting/_core.py
@@ -604,6 +604,7 @@ def boxplot_frame_groupby(
     .. plot::
         :context: close-figs
 
+        >>> np.random.seed(1234)
         >>> import itertools
         >>> tuples = [t for t in itertools.product(range(1000), range(4))]
         >>> index = pd.MultiIndex.from_tuples(tuples, names=['lvl0', 'lvl1'])
@@ -1328,6 +1329,7 @@ class PlotAccessor(PandasObject):
         .. plot::
             :context: close-figs
 
+            >>> np.random.seed(1234)
             >>> data = np.random.randn(25, 4)
             >>> df = pd.DataFrame(data, columns=list('ABCD'))
             >>> ax = df.plot.box()
@@ -1392,6 +1394,7 @@ class PlotAccessor(PandasObject):
         .. plot::
             :context: close-figs
 
+            >>> np.random.seed(1234)
             >>> df = pd.DataFrame(np.random.randint(1, 7, 6000), columns=['one'])
             >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000)
             >>> ax = df.plot.hist(bins=12, alpha=0.5)
@@ -1811,6 +1814,7 @@ class PlotAccessor(PandasObject):
         .. plot::
             :context: close-figs
 
+            >>> np.random.seed(1234)
             >>> n = 10000
             >>> df = pd.DataFrame({'x': np.random.randn(n),
             ...                    'y': np.random.randn(n)})
--- a/pandas/plotting/_misc.py
+++ b/pandas/plotting/_misc.py
@@ -204,6 +204,7 @@ def scatter_matrix(
     .. plot::
         :context: close-figs
 
+        >>> np.random.seed(1234)
         >>> df = pd.DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D'])
         >>> pd.plotting.scatter_matrix(df, alpha=0.2)
         array([[<Axes: xlabel='A', ylabel='A'>, <Axes: xlabel='B', ylabel='A'>,
@@ -438,6 +439,8 @@ def bootstrap_plot(
     .. plot::
         :context: close-figs
 
+        >>> np.random.seed(1234)
+        >>> random.seed(1234)  # for reproducibility
         >>> s = pd.Series(np.random.uniform(size=100))
         >>> pd.plotting.bootstrap_plot(s)  # doctest: +SKIP
         <Figure size 640x480 with 6 Axes>
@@ -597,6 +600,7 @@ def autocorrelation_plot(series: Series,
     .. plot::
         :context: close-figs
 
+        >>> np.random.seed(1234)
         >>> spacing = np.linspace(-9 * np.pi, 9 * np.pi, num=1000)
         >>> s = pd.Series(0.7 * np.random.rand(1000) + 0.3 * np.sin(spacing))
         >>> pd.plotting.autocorrelation_plot(s)  # doctest: +SKIP
--- a/doc/source/user_guide/style.ipynb
+++ b/doc/source/user_guide/style.ipynb
@@ -78,8 +78,37 @@
    "source": [
     "import pandas as pd\n",
     "import numpy as np\n",
-    "import matplotlib as mpl\n",
-    "\n",
+    "import matplotlib as mpl\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "nbsphinx": "hidden"
+   },
+   "outputs": [],
+   "source": [
+    "# For reproducibility - this doesn't respect uuid_len or positionally-passed uuid but the places here that use that coincidentally bypass this anyway\n",
+    "from pandas.io.formats.style import Styler\n",
+    "next_uuid = 1000\n",
+    "class StylerReproducible(Styler):\n",
+    "    def __init__(self, *args, uuid=None, **kwargs):\n",
+    "        global next_uuid\n",
+    "        if uuid is None:\n",
+    "            uuid = str(next_uuid)\n",
+    "            next_uuid = next_uuid + 1\n",
+    "        super().__init__(*args, uuid=uuid, **kwargs)\n",
+    "Styler = StylerReproducible\n",
+    "pd.DataFrame.style = property(lambda self: StylerReproducible(self))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "df = pd.DataFrame({\n",
     "    \"strings\": [\"Adam\", \"Mike\"],\n",
     "    \"ints\": [1, 3],\n",
@@ -104,6 +133,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "np.random.seed(25)  # for reproducibility\n",
     "weather_df = pd.DataFrame(np.random.rand(10,2)*5, \n",
     "                          index=pd.date_range(start=\"2021-01-01\", periods=10),\n",
     "                          columns=[\"Tokyo\", \"Beijing\"])\n",
@@ -1394,7 +1424,6 @@
    "outputs": [],
    "source": [
     "# Hide the construction of the display chart from the user\n",
-    "import pandas as pd\n",
     "from IPython.display import HTML\n",
     "\n",
     "# Test series\n",
@@ -1926,6 +1955,18 @@
    ]
   },
   {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "nbsphinx": "hidden"
+   },
+   "outputs": [],
+   "source": [
+    "# For reproducibility\n",
+    "Styler = StylerReproducible\n"
+   ]
+  },
+  {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -2126,7 +2167,8 @@
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.9.5"
-  }
+  },
+  "record_timing": false
  },
  "nbformat": 4,
  "nbformat_minor": 1