File: dr7_quasar.py

"""
SDSS DR7 Quasar Dataset Loader.

This implements a loader for the DR7 quasar dataset, located at
http://www.sdss.org/dr7/products/value_added/qsocat_dr7.html
"""
import os
from gzip import GzipFile
from io import BytesIO

import numpy as np

from .tools import download_with_progress_bar
from . import get_data_home

DATA_URL = 'http://das.sdss.org/va/qsocat/dr7qso.dat.gz'

ARCHIVE_FILE = 'dr7_quasar.npy'

# dtype of the columns extracted from the catalog
DR7_DTYPE = [('sdssID', 'a14'),
             ('RA', 'f8'),
             ('dec', 'f8'),
             ('redshift', 'f4'),
             ('mag_u', 'f4'),
             ('err_u', 'f4'),
             ('mag_g', 'f4'),
             ('err_g', 'f4'),
             ('mag_r', 'f4'),
             ('err_r', 'f4'),
             ('mag_i', 'f4'),
             ('err_i', 'f4'),
             ('mag_z', 'f4'),
             ('err_z', 'f4'),
             ('mag_J', 'f4'),
             ('err_J', 'f4'),
             ('mag_H', 'f4'),
             ('err_H', 'f4'),
             ('mag_K', 'f4'),
             ('err_K', 'f4'),
             ('specobjid', 'i8')]

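# column numbers in the catalog file corresponding to the fields in DR7_DTYPE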
COLUMN_NUMBERS = [0, 1, 2, 3,
                  4, 5, 6, 7,
                  8, 9, 10, 11, 12, 13,
                  22, 23, 24, 25, 26, 27, 72]

# number of header rows to skip in the catalog file
SKIP_ROWS = 80


def fetch_dr7_quasar(data_home=None, download_if_missing=True):
    """Loader for SDSS DR7 quasar catalog

    Parameters
    ----------
    data_home : optional, default=None
        Specify another download and cache folder for the datasets. By default
        all astroML data is stored in '~/astroML_data'.

    download_if_missing : optional, default=True
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    Returns
    -------
    data : ndarray, shape = (105783,)
        numpy record array containing the quasar catalog

    Examples
    --------
    >>> from astroML.datasets import fetch_dr7_quasar
    >>> data = fetch_dr7_quasar()  # doctest: +IGNORE_OUTPUT +REMOTE_DATA
    >>> u_g = data['mag_u'] - data['mag_g']  # doctest: +REMOTE_DATA
    >>> u_g[:3]  # first three u-g colors  # doctest: +REMOTE_DATA
    array([-0.07699966,  0.03600121,  0.10900116], dtype=float32)

    Notes
    -----
    Not all available data is extracted and saved.  The extracted columns are:

    sdssID, RA, dec, redshift, mag_u, err_u, mag_g, err_g, mag_r, err_r,
    mag_i, err_i, mag_z, err_z, mag_J, err_J, mag_H, err_H, mag_K, err_K,
    specobjid

    Many of the objects are missing 2MASS photometry.

    More information at
    http://www.sdss.org/dr7/products/value_added/qsocat_dr7.html
    """
    data_home = get_data_home(data_home)

    archive_file = os.path.join(data_home, ARCHIVE_FILE)

    if not os.path.exists(archive_file):
        if not download_if_missing:
            raise IOError('data not present on disk. '
                          'set download_if_missing=True to download')

        print("downloading DR7 quasar dataset from %s to %s"
              % (DATA_URL, data_home))

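        # download the gzipped catalog, decompress it in memory, and parse
        # only the selected columns into a structured array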
        zipped_buf = download_with_progress_bar(DATA_URL, return_buffer=True)
        gzf = GzipFile(fileobj=zipped_buf, mode='rb')
        extracted_buf = BytesIO(gzf.read())
        data = np.loadtxt(extracted_buf,
                          skiprows=SKIP_ROWS,
                          usecols=COLUMN_NUMBERS,
                          dtype=DR7_DTYPE)
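        # cache the parsed catalog locally so later calls can load it directly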
        np.save(archive_file, data)

    else:
        data = np.load(archive_file)

    return data
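

# Illustrative usage sketch (not part of the upstream module): assuming the
# catalog can be fetched (already cached locally or downloadable), this
# reproduces the docstring example of computing u-g colors. The '__main__'
# block below is a hypothetical addition for demonstration only.
if __name__ == '__main__':
    catalog = fetch_dr7_quasar()
    print("quasars in catalog:", catalog.shape[0])
    u_g = catalog['mag_u'] - catalog['mag_g']
    print("first three u-g colors:", u_g[:3])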