% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/class_dmDSdata.R, R/class_dmSQTLdata.R
\docType{methods}
\name{dmFilter}
\alias{dmFilter}
\alias{dmFilter,dmDSdata-method}
\alias{dmFilter,dmSQTLdata-method}
\title{Filtering}
\usage{
dmFilter(x, ...)

\S4method{dmFilter}{dmDSdata}(x, min_samps_gene_expr = 0,
  min_samps_feature_expr = 0, min_samps_feature_prop = 0,
  min_gene_expr = 0, min_feature_expr = 0, min_feature_prop = 0,
  run_gene_twice = FALSE)

\S4method{dmFilter}{dmSQTLdata}(x, min_samps_gene_expr = 0,
  min_samps_feature_expr = 0, min_samps_feature_prop = 0,
  minor_allele_freq = 0.05 * nrow(samples(x)), min_gene_expr = 0,
  min_feature_expr = 0, min_feature_prop = 0,
  BPPARAM = BiocParallel::SerialParam())
}
\arguments{
\item{x}{\code{\linkS4class{dmDSdata}} or \code{\linkS4class{dmSQTLdata}}
object.}

\item{...}{Other parameters that can be defined by methods using this
generic.}

\item{min_samps_gene_expr}{Minimal number of samples where genes should be 
expressed. See Details.}

\item{min_samps_feature_expr}{Minimal number of samples where features should
be expressed. See Details.}

\item{min_samps_feature_prop}{Minimal number of samples where features should
be expressed at the minimal proportion \code{min_feature_prop}. See Details.}

\item{min_gene_expr}{Minimal gene expression.}

\item{min_feature_expr}{Minimal feature expression.}

\item{min_feature_prop}{Minimal proportion for feature expression. This value
should be between 0 and 1.}

\item{run_gene_twice}{Whether to re-run the gene-level filter
after the feature-level filters.}

\item{minor_allele_freq}{Minimal number of samples where each of the 
genotypes has to be present.}

\item{BPPARAM}{Parallelization method used by 
\code{\link[BiocParallel]{bplapply}}.}
}
\value{
Returns filtered \code{\linkS4class{dmDSdata}} or 
  \code{\linkS4class{dmSQTLdata}} object.
}
\description{
Filtering of genes and features with low expression. Additionally, for the
dmSQTLdata object, filtering of genotypes with low frequency.
}
\details{
Filtering parameters should be adjusted according to the sample size
  of the experiment data and the number of replicates per condition.
  
  \code{min_samps_gene_expr} defines the minimal number of samples where 
  genes are required to be expressed at the minimal level of 
  \code{min_gene_expr} in order to be included in the downstream analysis. 
  Ideally, genes should be expressed at some minimal level in 
  all samples because this would lead to better estimates of feature ratios.
  
  Similarly, \code{min_samps_feature_expr} and \code{min_samps_feature_prop} 
  define the minimal number of samples where features are required to be 
  expressed at the minimal levels of counts \code{min_feature_expr} or 
  proportions \code{min_feature_prop}. In differential transcript/exon usage
  analysis, we suggest using \code{min_samps_feature_expr} and 
  \code{min_samps_feature_prop} equal to the minimal number of replicates in 
  any of the conditions. For example, in an assay with 3 versus 5 replicates,
  we would set these parameters to 3, which allows a situation where a 
  feature is expressed in one condition but may not be expressed at all in 
  another one, which is an example of differential transcript/exon usage.
  
  By default, all the filtering parameters equal zero, which means that 
  features with zero expression in all samples are removed, as well as genes 
  with only one non-zero feature.

In QTL analysis, usually, we deal with data that has many more replicates 
than data from a standard differential usage assay. Our example data set 
consists of 91 samples. Requiring that genes are expressed in all samples may
be too stringent, especially since there may be missing values in the data 
and for some genes you may not observe counts in all 91 samples. A slightly 
lower threshold ensures that we do not eliminate such genes. For example, if 
\code{min_samps_gene_expr = 70} and \code{min_gene_expr = 10}, only genes 
with expression of at least 10 in at least 70 samples are kept. Samples with 
expression lower than 10 have \code{NA}s assigned and are skipped in the 
analysis of this gene. \code{minor_allele_freq} indicates the minimal number 
of samples in which the minor allele must be present. Usually, it is equal to 
roughly 5\% of the total number of samples.
}
\examples{
# --------------------------------------------------------------------------
# Create dmDSdata object 
# --------------------------------------------------------------------------
## Get kallisto transcript counts from the 'PasillaTranscriptExpr' package

library(PasillaTranscriptExpr)
\donttest{
data_dir  <- system.file("extdata", package = "PasillaTranscriptExpr")

## Load metadata
pasilla_metadata <- read.table(file.path(data_dir, "metadata.txt"), 
header = TRUE, as.is = TRUE)

## Load counts
pasilla_counts <- read.table(file.path(data_dir, "counts.txt"), 
header = TRUE, as.is = TRUE)

## Create a pasilla_samples data frame
pasilla_samples <- data.frame(sample_id = pasilla_metadata$SampleName, 
  group = pasilla_metadata$condition)
levels(pasilla_samples$group)

## Create a dmDSdata object
d <- dmDSdata(counts = pasilla_counts, samples = pasilla_samples)

## Use a subset of genes, which is defined in the following file
gene_id_subset <- readLines(file.path(data_dir, "gene_id_subset.txt"))

d <- d[names(d) \%in\% gene_id_subset, ]

# --------------------------------------------------------------------------
# Differential transcript usage analysis - simple two group comparison 
# --------------------------------------------------------------------------

## Filtering
## Check what is the minimal number of replicates per condition 
table(samples(d)$group)

d <- dmFilter(d, min_samps_gene_expr = 7, min_samps_feature_expr = 3,
  min_gene_expr = 10, min_feature_expr = 10)

plotData(d)
}
# --------------------------------------------------------------------------
# Create dmSQTLdata object
# --------------------------------------------------------------------------
# Use subsets of data defined in the GeuvadisTranscriptExpr package

library(GeuvadisTranscriptExpr)
\donttest{
geuv_counts <- GeuvadisTranscriptExpr::counts
geuv_genotypes <- GeuvadisTranscriptExpr::genotypes
geuv_gene_ranges <- GeuvadisTranscriptExpr::gene_ranges
geuv_snp_ranges <- GeuvadisTranscriptExpr::snp_ranges

colnames(geuv_counts)[c(1,2)] <- c("feature_id", "gene_id")
colnames(geuv_genotypes)[4] <- "snp_id"
geuv_samples <- data.frame(sample_id = colnames(geuv_counts)[-c(1,2)])

d <- dmSQTLdata(counts = geuv_counts, gene_ranges = geuv_gene_ranges,  
  genotypes = geuv_genotypes, snp_ranges = geuv_snp_ranges, 
  samples = geuv_samples, window = 5e3)

# --------------------------------------------------------------------------
# sQTL analysis - simple group comparison
# --------------------------------------------------------------------------

## Filtering
d <- dmFilter(d, min_samps_gene_expr = 70, min_samps_feature_expr = 5,
  minor_allele_freq = 5, min_gene_expr = 10, min_feature_expr = 10)
  
plotData(d)
}
}
\seealso{
\code{\link{plotData}}
}
\author{
Malgorzata Nowicka
}
