# Install pacman package if not already installed
if (!require(pacman)) install.packages("pacman")
# Load the necessary packages using pacman
pacman::p_load(RSQLite, DBI, data.table, lubridate)
7 Data Aggregation
In this section, we will focus on aggregating the WiFi data to make it more manageable and suitable for analysis. We will start by explaining the steps in detail to perform data aggregation, then create a function to automate the entire process, and finally apply this function to multiple SQLite3 databases in the ch3
folder.
If you have your own SQLite3 database file, you can use it for this process. If not, you can download our example SQLite3 database file here to follow along with the tutorial.
7.1 Step-by-Step Data Aggregation
7.1.1 Loading Packages
Before we start, let's make sure that all the necessary packages are installed and loaded. We use the pacman package to manage our package dependencies, as shown in the code chunk at the top of this page.
7.1.2 Loading the Data
First, we will load the WiFi data from the SQLite3 database.
# Establish a connection to the SQLite3 database
conn <- dbConnect(SQLite(), "material/ch3/sample_1.sqlite3")

# Query the WiFi data from the database
wifi_data <- dbGetQuery(conn, "SELECT sensor_name, timestamp, type, subtype, strength AS rssi, source_address, source_address_randomized FROM packets")
# Convert to data.table
setDT(wifi_data)
# Print the first few rows to verify
head(wifi_data)
sensor_name timestamp type subtype rssi
<char> <char> <char> <char> <int>
1: A01 2024-04-09T19:17:27.536121 management probe-response -65
2: A01 2024-04-09T19:17:27.541249 management probe-response -67
3: A01 2024-04-09T19:17:27.635933 management probe-response -67
4: A01 2024-04-09T19:17:27.746452 management probe-request -67
5: A01 2024-04-09T19:17:27.765945 management probe-request -65
6: A01 2024-04-09T19:17:27.779055 management probe-request -75
source_address
<char>
1: f0659bdd9305e4341afb9f55df7cd20a4adfd726f83a33c3857281dfa3de8575
2: f0659bdd9305e4341afb9f55df7cd20a4adfd726f83a33c3857281dfa3de8575
3: f0659bdd9305e4341afb9f55df7cd20a4adfd726f83a33c3857281dfa3de8575
4: d94147cf12befe41bb40dd7957733c54442de7a9d45a75ec3c747856c4bdc129
5: d94147cf12befe41bb40dd7957733c54442de7a9d45a75ec3c747856c4bdc129
6: 5e69a0bc9bd73c0b72642e2e0f4f99670b85e8fdf4616bc19fb1f8d63107bfe5
source_address_randomized
<int>
1: 0
2: 0
3: 0
4: 1
5: 1
6: 1
7.1.3 Filtering by Time
To focus on a specific time period for our analysis, we will define a start and end date using the ymd_hms
function from the lubridate
package. We will then filter the data to include only rows within our specified datetime range.
# Define start and end datetime for filtering
<- ymd_hms("2024-04-09 19:17:00")
start_date <- ymd_hms("2024-04-09 19:20:00")
end_date
# Convert timestamp to datetime format and filter data within the specified datetime range
wifi_data_filtered_time <- wifi_data[
  between(ymd_hms(timestamp), start_date, end_date)
]
7.1.4 Filtering by Frame Type
Certain subtypes
of packets may not be relevant for our purposes. We will remove rows where the subtype
column contains the word “response”.
# Remove rows where the subtype contains "response"
wifi_data_filtered_frame <- wifi_data_filtered_time[!grepl("response", subtype)]
We remove “response” packets because they are sent from APs to devices, not from devices to APs. In the raw WiFi data collected over a month on a university campus (see the table below), “response” packets account for a large proportion (35.3% for “probe-response”). By filtering them out, we focus on the more relevant packets originating from devices, which provide information about the presence and movement of pedestrians.
| Type       | Subtype                | Raw WiFi data Count | Raw WiFi data Proportion |
|------------|------------------------|---------------------|--------------------------|
| Management | probe-request          | 714,353             | 2.6%                     |
| Management | probe-response         | 9,532,383           | 35.3%                    |
| Management | authentication         | 352,856             | 1.3%                     |
| Management | deauthentication       | 30,765              | 0.1%                     |
| Management | association-request    | 16,043              | 0.1%                     |
| Management | association-response   | 13,621              | 0.1%                     |
| Management | reassociation-request  | 44,284              | 0.2%                     |
| Management | reassociation-response | 33,429              | 0.1%                     |
| Management | disassociation         | 24,525              | 0.1%                     |
| Management | action                 | 351,477             | 1.3%                     |
| Data       | data                   | 18,546              | 0.1%                     |
| Data       | null                   | 8,716,923           | 32.3%                    |
| Data       | qos-data               | 4,875,257           | 18.1%                    |
| Data       | qos-null               | 2,253,010           | 8.4%                     |
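If you want a similar breakdown for your own capture, the counts and proportions can be computed directly with data.table. This is a minimal sketch assuming the raw wifi_data table from Section 7.1.2 is still in memory; the numbers for the small sample dataset will naturally differ from the month-long campus data shown above, and subtype_counts is just an illustrative variable name.
# Count packets by frame type and subtype, then add a percentage column
subtype_counts <- wifi_data[, .N, by = .(type, subtype)]
subtype_counts[, proportion := round(100 * N / sum(N), 1)]
# Sort so the most frequent subtypes appear first
subtype_counts[order(-N)]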
7.1.5 Filtering by Signal Strength
The signal strength (RSSI) of WiFi packets can provide valuable information about the proximity and location of devices. We will filter our data to include only rows where the rssi
column falls between -30 and -80 dBm.
# Filter data to include only rows with signal strength between -30 and -80 dBm
wifi_data_filtered_strength <- wifi_data_filtered_frame[between(rssi, -80, -30)]
We filter the signal strength to remove extreme values. Very strong signals (above -30 dBm) may come from devices very close to the sensor, like a tablet right next to it, which may not represent typical pedestrian behavior. Very weak signals (below -80 dBm) may be less reliable, coming from far away or obstructed devices. Focusing on the -30 to -80 dBm range helps capture data more likely to represent pedestrians moving within a reasonable distance from the sensors.
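The -30 to -80 dBm cutoffs work well for our sensors, but suitable thresholds depend on the hardware and the environment. One way to sanity-check them against your own data is to look at the RSSI distribution before filtering; this short sketch uses the frame-filtered table from the previous step.
# Inspect the RSSI distribution to judge whether the cutoffs are sensible
summary(wifi_data_filtered_frame$rssi)
quantile(wifi_data_filtered_frame$rssi, probs = c(0.01, 0.05, 0.5, 0.95, 0.99))
# Share of packets that the -30 to -80 dBm filter would drop
wifi_data_filtered_frame[, mean(!between(rssi, -80, -30))]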
7.1.6 Aggregating Data by Time Intervals
We will aggregate the data by sensor_name and source_address into specified time intervals. This involves rounding each timestamp down to the start of its interval with floor_date and then summarizing the data.
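To see what this rounding does, here is a small illustration of floor_date applied to one raw timestamp from the data above; the expected results are shown as comments.
# floor_date truncates a timestamp to the start of the chosen interval
floor_date(ymd_hms("2024-04-09 19:17:27.536121"), unit = "second")
# "2024-04-09 19:17:27 UTC"
floor_date(ymd_hms("2024-04-09 19:17:27.536121"), unit = "minute")
# "2024-04-09 19:17:00 UTC"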
# Define aggregation function
aggregate_intervals <- function(data, interval) {
  data[, timestamp := floor_date(ymd_hms(timestamp), unit = interval)]
  aggregated_data <- data[, .(
    median_rssi = median(rssi),
    count = .N
  ), by = .(sensor_name, source_address, source_address_randomized, timestamp)]
  return(aggregated_data)
}
# Aggregate the data by 1-second intervals
aggregated_data <- aggregate_intervals(wifi_data_filtered_strength, "second")
# Print the first few rows of the aggregated data
head(aggregated_data)
sensor_name source_address
<char> <char>
1: A01 d94147cf12befe41bb40dd7957733c54442de7a9d45a75ec3c747856c4bdc129
2: A01 5e69a0bc9bd73c0b72642e2e0f4f99670b85e8fdf4616bc19fb1f8d63107bfe5
3: A01 05d29a432f4ff4c5f2e49e185334619d4365ef65370fcf9891bc7b1f8c0a68b6
4: A01 a6a0a285818a48c083c72c885283f1652208b3239f70e859f49067b36781acc6
5: A01 b3268f2d7ca90e7ea3ff549decbf484d478c3eaf28784a7bbfbd5aaee22d3a6a
6: A01 f6e4a5fce8432422779b9e68da551a19b24b749ddbd58735bd95334747258d66
source_address_randomized timestamp median_rssi count
<int> <POSc> <num> <int>
1: 1 2024-04-09 19:17:27 -66 2
2: 1 2024-04-09 19:17:27 -75 2
3: 0 2024-04-09 19:17:27 -78 2
4: 0 2024-04-09 19:17:27 -75 1
5: 1 2024-04-09 19:17:27 -78 2
6: 0 2024-04-09 19:17:28 -76 2
7.1.7 Saving Aggregated Data
Finally, we will save the aggregated data to a CSV file for future use.
# Save the aggregated data to a CSV file
fwrite(aggregated_data, "material/ch3/aggregated_sample_1.csv")
7.1.8 Closing the Database Connection
After loading and processing the data, it is important to close the database connection to free up resources:
dbDisconnect(conn)
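In longer scripts it is easy to forget this step, particularly if an error occurs between dbConnect() and dbDisconnect(). One defensive pattern, shown here only as a sketch with a hypothetical helper called read_packets(), is to register the disconnect with on.exit() inside a function so the connection is released even if a later step fails.
# Sketch: read_packets() is a hypothetical helper, not part of the tutorial code
read_packets <- function(db_path) {
  conn <- dbConnect(SQLite(), db_path)
  # Guarantee the connection is closed when the function exits, even on error
  on.exit(dbDisconnect(conn), add = TRUE)
  dbGetQuery(conn, "SELECT * FROM packets")
}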
7.2 Tracking Data Changes
We will create a table that shows how the number of packets and the number of unique source_address values change through each step of the data aggregation process.
# Initialize summary table
summary_table <- data.table(
  Step = character(),
  Packet_Count = integer(),
  Unique_Source_Addresses = integer()
)
# Initial counts
summary_table <- rbind(summary_table, data.table(
  Step = "Initial",
  Packet_Count = nrow(wifi_data),
  Unique_Source_Addresses = length(unique(wifi_data$source_address))
))
# After filtering by datetime range
summary_table <- rbind(summary_table, data.table(
  Step = "After Datetime Filter",
  Packet_Count = nrow(wifi_data_filtered_time),
  Unique_Source_Addresses = length(unique(wifi_data_filtered_time$source_address))
))
# After filtering by frame type
summary_table <- rbind(summary_table, data.table(
  Step = "After Frame Type Filter",
  Packet_Count = nrow(wifi_data_filtered_frame),
  Unique_Source_Addresses = length(unique(wifi_data_filtered_frame$source_address))
))
# After filtering by signal strength
summary_table <- rbind(summary_table, data.table(
  Step = "After Strength Filter",
  Packet_Count = nrow(wifi_data_filtered_strength),
  Unique_Source_Addresses = length(unique(wifi_data_filtered_strength$source_address))
))
# After aggregation
summary_table <- rbind(summary_table, data.table(
  Step = "After Aggregation",
  Packet_Count = nrow(aggregated_data),
  Unique_Source_Addresses = length(unique(aggregated_data$source_address))
))
# Print the summary table
print(summary_table)
Step Packet_Count Unique_Source_Addresses
<char> <int> <int>
1: Initial 11490 323
2: After Datetime Filter 5274 163
3: After Frame Type Filter 3380 123
4: After Strength Filter 2904 112
5: After Aggregation 522 112
Here is the interpretation:
- Initial: The raw data contains 11,490 packets from 323 unique devices.
- After Datetime Filter: Filtering by the specified datetime range reduces the data to 5,274 packets from 163 unique devices.
- After Frame Type Filter: Removing “response” frame types further reduces the data to 3,380 packets from 123 unique devices.
- After Strength Filter: Filtering by signal strength between -30 and -80 dBm results in 2,904 packets from 112 unique devices.
- After Aggregation: Aggregating the data into 1-second intervals leaves us with 522 aggregated data points from the same 112 unique devices.
Each filtering step substantially reduces the amount of data, narrowing it down to the most relevant packets, while the final aggregation step collapses repeated detections into per-second summaries without changing the number of unique devices.
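The repeated rbind() calls above are explicit but verbose. As a more compact alternative, a small helper can build the same table with rbindlist(); this is a sketch using the intermediate tables already in memory, and count_step is simply an illustrative name.
# Sketch: build the same summary table with a helper instead of repeated rbind()
count_step <- function(step, dt) {
  data.table(
    Step = step,
    Packet_Count = nrow(dt),
    Unique_Source_Addresses = uniqueN(dt$source_address)
  )
}

summary_table <- rbindlist(list(
  count_step("Initial", wifi_data),
  count_step("After Datetime Filter", wifi_data_filtered_time),
  count_step("After Frame Type Filter", wifi_data_filtered_frame),
  count_step("After Strength Filter", wifi_data_filtered_strength),
  count_step("After Aggregation", aggregated_data)
))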
7.3 Automating the Process with a Function
To streamline the process, we will create a function that performs the entire data aggregation process. This function will:

1. Load the data from the SQLite3 database.
2. Perform basic filtering by datetime, frame type, and signal strength.
3. Aggregate the data by sensor_name and source_address into specified time intervals.
4. Save the aggregated data to a CSV file.
aggregate_data <- function(db_path, start_date, end_date, interval = "second", output_suffix = "_1second.csv") {
  # Establish a connection to the SQLite3 database
  conn <- dbConnect(SQLite(), db_path)

  # Query the WiFi data from the database
  wifi_data <- dbGetQuery(conn, "SELECT sensor_name, timestamp, type, subtype, strength AS rssi, source_address, source_address_randomized FROM packets")
  setDT(wifi_data)

  # Filter by datetime range
  wifi_data <- wifi_data[between(ymd_hms(timestamp), start_date, end_date)]

  # Filter by frame type
  wifi_data <- wifi_data[!grepl("response", subtype)]

  # Filter by signal strength
  wifi_data <- wifi_data[between(rssi, -80, -30)]

  # Aggregate the data by specified intervals
  wifi_data[, timestamp := floor_date(ymd_hms(timestamp), unit = interval)]
  aggregated_data <- wifi_data[, .(median_rssi = median(rssi), count = .N), by = .(sensor_name, source_address, source_address_randomized, timestamp)]

  # Define the output path for the aggregated data
  output_path <- sub("\\.sqlite3$", output_suffix, db_path)

  # Save the aggregated data to a CSV file
  fwrite(aggregated_data, output_path)

  # Close the database connection
  dbDisconnect(conn)
}
# Example usage
<- ymd_hms("2024-04-09 19:17:00")
start_date <- ymd_hms("2024-04-09 19:20:00")
end_date aggregate_data("material/ch3/sample_1.sqlite3", start_date, end_date, interval = "second")
7.4 Applying the Function to Multiple Databases
We will apply the aggregate_data
function to all SQLite3 databases in the ch3
folder that contain the word “sample” in their filenames. The resulting CSV files will be saved with “_1second” appended to their original names. Using lapply
is effective for applying a function over a list, but we can also use purrr::map
for better readability and functionality.
pacman::p_load(purrr)
# List all SQLite3 files in the ch3 folder that contain "sample_"
db_files <- list.files("material/ch3", pattern = "sample_.*\\.sqlite3$", full.names = TRUE)
print(db_files)
[1] "material/ch3/sample_1.sqlite3" "material/ch3/sample_2.sqlite3"
# Define the start and end dates for filtering
<- ymd_hms("2024-04-09 19:17:00")
start_date <- ymd_hms("2024-04-09 19:20:00")
end_date
# Apply the aggregation function to each database file
map(db_files, ~aggregate_data(.x, start_date, end_date, interval = "second"))
[[1]]
[1] TRUE
[[2]]
[1] TRUE
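Because aggregate_data() is called only for its side effect of writing a CSV file, the TRUE values printed above (the result of the final dbDisconnect() call) carry no useful information. If you prefer to suppress them, purrr::walk() applies the same function but returns its input invisibly.
# walk() runs the function for its side effects and prints nothing
walk(db_files, ~aggregate_data(.x, start_date, end_date, interval = "second"))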