# load libraries
library(pdftools)
library(stringr)
library(datapasta)
library(purrr)
First approach: parsing the PDF with the pdftools
package:
# download the file
# Keep the source URL and the local file name in one place so the download
# step and the read step cannot drift apart.
pdf_url <- "http://portal.ct.gov/-/media/sots/ElectionServices/Registration_and_Enrollment_Stats/Nov17RPES.pdf?la=en"
pdf_file <- "Nov17RPES.pdf"
# Only hit the network when there is no local copy yet, so re-running the
# script does not re-download the report. mode = "wb" requests a binary
# transfer, which a PDF needs.
if (!file.exists(pdf_file)) {
  download.file(pdf_url, pdf_file, mode = "wb")
}
# txt is indexed by page below (txt[1] is the first page)
txt <- pdf_text(pdf_file)
# first page text
cat(txt[1])
## Connecticut Secretary of the State
## Registration and Party Enrollment Statistics as of November 1, 2017
## Based on reports as provided to the Secretary of the State by the Registrars of Voters. **See last page for Key to Minor Parties.
## Republican Party Democratic Party Minor Parties Unaffiliated Totals
## Town County C D Active Inactive Total Active Inactive Total Active Inactive Total M Notes Active Inactive Total Active Inactive Totals
## Andover Tolland 2 596 3 599 740 2 742 47 0 47 G IT L 887 5 892 2,270 10 2,280
## Ansonia New Haven 3 1596 135 1,731 3167 405 3,572 130 7 137 G IT L 4060 626 4,686 8,953 1,173 10,126
## Ashford Windham 2 579 59 638 961 104 1,065 37 4 41 G IT L 1084 276 1,360 2,661 443 3,104
## Avon Hartford 5 4007 274 4,281 3674 299 3,973 126 9 135 G IT L WF 4862 613 5,475 12,669 1,195 13,864
## Barkhamsted Litchfield 1 867 16 883 714 27 741 35 1 36 G IT L 1200 43 1,243 2,816 87 2,903
## Beacon Falls New Haven 3 1091 33 1,124 1114 36 1,150 63 2 65 G IT L WF 2090 64 2,154 4,358 135 4,493
## Berlin Hartford 1 3518 133 3,651 4780 218 4,998 75 5 80 G IT L WF 5441 338 5,779 13,814 694 14,508
## Bethany New Haven 3 1152 44 1,196 1165 46 1,211 54 2 56 G IT L 1863 83 1,946 4,234 175 4,409
## Bethel Fairfield 5 3335 80 3,415 3355 147 3,502 199 3 202 G IT L PRO WF 5345 237 5,582 12,234 467 12,701
## Bethlehem Litchfield 5 898 37 935 677 55 732 52 1 53 G IT L WF 1128 154 1,282 2,755 247 3,002
## Bloomfield Hartford 1 1416 90 1,506 8600 308 8,908 140 3 143 G IT L WF 4094 244 4,338 14,250 645 14,895
## Bolton Tolland 2 976 68 1,044 1006 89 1,095 49 5 54 G IT L 1339 160 1,499 3,370 322 3,692
## Bozrah New London 2 359 13 372 600 21 621 39 2 41 G IT L 795 31 826 1,793 67 1,860
## Branford New Haven 3 3644 124 3,768 6784 236 7,020 181 4 185 G IT L WF 9514 322 9,836 20,123 686 20,809
## Bridgeport Fairfield 4 4128 820 4,948 45573 7345 52,918 504 27 531 G IT L WF 19030 4446 23,476 69,235 12,638 81,873
## Bridgewater Litchfield 5 421 0 421 412 540 952 25 0 25 G IT L 466 0 466 1,324 540 1,864
## Bristol Hartford 1 6911 206 7,117 12739 96 12,835 605 22 627 G IT L WF 14025 779 14,804 34,280 1,103 35,383
## Brookfield Fairfield 5 3796 136 3,932 2678 21 2,699 240 10 250 ABP G IT L UIT 4918 208 5,126 11,632 375 12,007
## Brooklyn Windham 2 1262 27 1,289 1436 66 1,502 95 2 97 G IT L WF 2670 58 2,728 5,463 153 5,616
## Burlington Hartford 5 1874 41 1,915 1668 21 1,689 79 5 84 G IT L 2717 158 2,875 6,338 225 6,563
## Canaan Litchfield 5 165 16 181 269 0 269 4 0 4 G IT 267 45 312 705 61 766
## Canterbury Windham 2 1077 18 1,095 786 13 799 68 1 69 CF G IT L WF 1599 69 1,668 3,530 101 3,631
## Canton Hartford 5 2260 88 2,348 2321 96 2,417 87 7 94 G IT L 2865 151 3,016 7,533 342 7,875
## Chaplin Windham 2 478 4 482 418 11 429 16 0 16 IT L WF 581 21 602 1,493 36 1,529
# Break page 1 into its individual lines. strsplit() returns a list with one
# element per input string, so the lines of the page live in txt1[[1]].
txt1 <- strsplit(txt[1], split = "\n")
# Lines 1-3 are metadata; line 4 is the party-group header row.
txt1[[1]][4]
## [1] " Republican Party Democratic Party Minor Parties Unaffiliated Totals"
# Build readable column names from the column-header row (line 5 of page 1).
# The positions below are not consecutive -- presumably because splitting on a
# single space leaves empty strings wherever the PDF text has runs of blanks
# (TODO confirm against the raw txt1[[1]][5]).
columns <- strsplit(txt1[[1]][5], split = " ")
columns2 <- c(
  # Town, County
  columns[[1]][c(1, 10)],
  # "C D" is one column whose label is spread over two tokens
  paste(columns[[1]][18], columns[[1]][19]),
  # Active / Inactive / Total repeat per party group, so prefix each with the
  # group name to keep the names unique
  paste("Republican", columns[[1]][20:22]),
  paste("Democratic", columns[[1]][25:27]),
  paste("Minor", columns[[1]][29:31]),
  # "M Notes" is likewise a two-token label
  paste("Minor", columns[[1]][32], columns[[1]][33]),
  paste("Unaffiliated", columns[[1]][c(53, 55, 56)]),
  paste("Totals", columns[[1]][60:62])
)
columns2
## [1] "Town" "County"
## [3] "C D" "Republican Active"
## [5] "Republican Inactive" "Republican Total"
## [7] "Democratic Active" "Democratic Inactive"
## [9] "Democratic Total" "Minor Active"
## [11] "Minor Inactive" "Minor Total"
## [13] "Minor M Notes" "Unaffiliated Active"
## [15] "Unaffiliated Inactive" "Unaffiliated Total"
## [17] "Totals Active" "Totals Inactive"
## [19] "Totals Totals"
# The fixed indices used above cannot be reused for the town rows: the number
# of minor-party note codes varies, so each town's vector has a different
# length. Note that "\\w+" also breaks "1,731" into "1" and "731".
str_extract_all(string = txt1[[1]][7], pattern = "\\w+")
## [[1]]
## [1] "Ansonia" "New" "Haven" "3" "1596" "135" "1"
## [8] "731" "3167" "405" "3" "572" "130" "7"
## [15] "137" "G" "IT" "L" "4060" "626" "4"
## [22] "686" "8" "953" "1" "173" "10" "126"
New approach using the datapasta
package for the first page:
# First-page table rows, copied from the original PDF and pasted in with the
# RStudio addin 'paste as a vector (vertical)'.
# Element 1 is the column-header row; every following element is one town,
# with the fields separated by single spaces.
cp_data <- c("Town County C D Active Inactive Total Active Inactive Total Active Inactive Total M Notes Active Inactive Total Active Inactive Totals",
"Andover Tolland 2 596 3 599 740 2 742 47 0 47 G IT L 887 5 892 2,270 10 2,280",
"Ansonia New Haven 3 1596 135 1,731 3167 405 3,572 130 7 137 G IT L 4060 626 4,686 8,953 1,173 10,126",
"Ashford Windham 2 579 59 638 961 104 1,065 37 4 41 G IT L 1084 276 1,360 2,661 443 3,104",
"Avon Hartford 5 4007 274 4,281 3674 299 3,973 126 9 135 G IT L WF 4862 613 5,475 12,669 1,195 13,864",
"Barkhamsted Litchfield 1 867 16 883 714 27 741 35 1 36 G IT L 1200 43 1,243 2,816 87 2,903",
"Beacon Falls New Haven 3 1091 33 1,124 1114 36 1,150 63 2 65 G IT L WF 2090 64 2,154 4,358 135 4,493",
"Berlin Hartford 1 3518 133 3,651 4780 218 4,998 75 5 80 G IT L WF 5441 338 5,779 13,814 694 14,508",
"Bethany New Haven 3 1152 44 1,196 1165 46 1,211 54 2 56 G IT L 1863 83 1,946 4,234 175 4,409",
"Bethel Fairfield 5 3335 80 3,415 3355 147 3,502 199 3 202 G IT L PRO WF 5345 237 5,582 12,234 467 12,701",
"Bethlehem Litchfield 5 898 37 935 677 55 732 52 1 53 G IT L WF 1128 154 1,282 2,755 247 3,002",
"Bloomfield Hartford 1 1416 90 1,506 8600 308 8,908 140 3 143 G IT L WF 4094 244 4,338 14,250 645 14,895",
"Bolton Tolland 2 976 68 1,044 1006 89 1,095 49 5 54 G IT L 1339 160 1,499 3,370 322 3,692",
"Bozrah New London 2 359 13 372 600 21 621 39 2 41 G IT L 795 31 826 1,793 67 1,860",
"Branford New Haven 3 3644 124 3,768 6784 236 7,020 181 4 185 G IT L WF 9514 322 9,836 20,123 686 20,809",
"Bridgeport Fairfield 4 4128 820 4,948 45573 7345 52,918 504 27 531 G IT L WF 19030 4446 23,476 69,235 12,638 81,873",
"Bridgewater Litchfield 5 421 0 421 412 540 952 25 0 25 G IT L 466 0 466 1,324 540 1,864",
"Bristol Hartford 1 6911 206 7,117 12739 96 12,835 605 22 627 G IT L WF 14025 779 14,804 34,280 1,103 35,383",
"Brookfield Fairfield 5 3796 136 3,932 2678 21 2,699 240 10 250 ABP G IT L UIT 4918 208 5,126 11,632 375 12,007",
"Brooklyn Windham 2 1262 27 1,289 1436 66 1,502 95 2 97 G IT L WF 2670 58 2,728 5,463 153 5,616",
"Burlington Hartford 5 1874 41 1,915 1668 21 1,689 79 5 84 G IT L 2717 158 2,875 6,338 225 6,563",
"Canaan Litchfield 5 165 16 181 269 0 269 4 0 4 G IT 267 45 312 705 61 766",
"Canterbury Windham 2 1077 18 1,095 786 13 799 68 1 69 CF G IT L WF 1599 69 1,668 3,530 101 3,631",
"Canton Hartford 5 2260 88 2,348 2321 96 2,417 87 7 94 G IT L 2865 151 3,016 7,533 342 7,875",
"Chaplin Windham 2 478 4 482 418 11 429 16 0 16 IT L WF 581 21 602 1,493 36 1,529")
# Two-word county names would be split into two tokens later on, so locate
# them first. str_extract() yields NA for rows without a match.
nh <- str_extract(string = cp_data, pattern = "New Haven")
nl <- str_extract(string = cp_data, pattern = "New London")
# Show the rows that mention New Haven.
cp_data[which(!is.na(nh))]
## [1] "Ansonia New Haven 3 1596 135 1,731 3167 405 3,572 130 7 137 G IT L 4060 626 4,686 8,953 1,173 10,126"
## [2] "Beacon Falls New Haven 3 1091 33 1,124 1114 36 1,150 63 2 65 G IT L WF 2090 64 2,154 4,358 135 4,493"
## [3] "Bethany New Haven 3 1152 44 1,196 1165 46 1,211 54 2 56 G IT L 1863 83 1,946 4,234 175 4,409"
## [4] "Branford New Haven 3 3644 124 3,768 6784 236 7,020 181 4 185 G IT L WF 9514 322 9,836 20,123 686 20,809"
# Glue each two-word county name into a single token so that it survives the
# word split as one field. str_replace() only touches the first match per row.
cp_data <- cp_data %>%
  str_replace("New Haven", "NewHaven") %>%
  str_replace("New London", "NewLondon")
# Tokenise every town row into words; the result is a list holding one
# character vector per row.
# cp_data[1] is the header row, so drop it with a negative index rather than
# the hard-coded 2:24 range: cp_data has 25 elements (1 header + 24 towns), so
# 2:24 stopped one short and silently dropped the last town (Chaplin).
# boundary("word") keeps comma-grouped numbers such as "2,270" as one token.
# NOTE: the printed output below was captured with the old 2:24 slice, so it
# ends at Canton ([[23]]); re-running now also yields [[24]] for Chaplin.
cp_data <- str_extract_all(cp_data[-1], boundary("word"))
cp_data
## [[1]]
## [1] "Andover" "Tolland" "2" "596" "3" "599" "740"
## [8] "2" "742" "47" "0" "47" "G" "IT"
## [15] "L" "887" "5" "892" "2,270" "10" "2,280"
##
## [[2]]
## [1] "Ansonia" "NewHaven" "3" "1596" "135" "1,731"
## [7] "3167" "405" "3,572" "130" "7" "137"
## [13] "G" "IT" "L" "4060" "626" "4,686"
## [19] "8,953" "1,173" "10,126"
##
## [[3]]
## [1] "Ashford" "Windham" "2" "579" "59" "638" "961"
## [8] "104" "1,065" "37" "4" "41" "G" "IT"
## [15] "L" "1084" "276" "1,360" "2,661" "443" "3,104"
##
## [[4]]
## [1] "Avon" "Hartford" "5" "4007" "274" "4,281"
## [7] "3674" "299" "3,973" "126" "9" "135"
## [13] "G" "IT" "L" "WF" "4862" "613"
## [19] "5,475" "12,669" "1,195" "13,864"
##
## [[5]]
## [1] "Barkhamsted" "Litchfield" "1" "867" "16"
## [6] "883" "714" "27" "741" "35"
## [11] "1" "36" "G" "IT" "L"
## [16] "1200" "43" "1,243" "2,816" "87"
## [21] "2,903"
##
## [[6]]
## [1] "Beacon" "Falls" "NewHaven" "3" "1091" "33"
## [7] "1,124" "1114" "36" "1,150" "63" "2"
## [13] "65" "G" "IT" "L" "WF" "2090"
## [19] "64" "2,154" "4,358" "135" "4,493"
##
## [[7]]
## [1] "Berlin" "Hartford" "1" "3518" "133" "3,651"
## [7] "4780" "218" "4,998" "75" "5" "80"
## [13] "G" "IT" "L" "WF" "5441" "338"
## [19] "5,779" "13,814" "694" "14,508"
##
## [[8]]
## [1] "Bethany" "NewHaven" "3" "1152" "44" "1,196"
## [7] "1165" "46" "1,211" "54" "2" "56"
## [13] "G" "IT" "L" "1863" "83" "1,946"
## [19] "4,234" "175" "4,409"
##
## [[9]]
## [1] "Bethel" "Fairfield" "5" "3335" "80"
## [6] "3,415" "3355" "147" "3,502" "199"
## [11] "3" "202" "G" "IT" "L"
## [16] "PRO" "WF" "5345" "237" "5,582"
## [21] "12,234" "467" "12,701"
##
## [[10]]
## [1] "Bethlehem" "Litchfield" "5" "898" "37"
## [6] "935" "677" "55" "732" "52"
## [11] "1" "53" "G" "IT" "L"
## [16] "WF" "1128" "154" "1,282" "2,755"
## [21] "247" "3,002"
##
## [[11]]
## [1] "Bloomfield" "Hartford" "1" "1416" "90"
## [6] "1,506" "8600" "308" "8,908" "140"
## [11] "3" "143" "G" "IT" "L"
## [16] "WF" "4094" "244" "4,338" "14,250"
## [21] "645" "14,895"
##
## [[12]]
## [1] "Bolton" "Tolland" "2" "976" "68" "1,044" "1006"
## [8] "89" "1,095" "49" "5" "54" "G" "IT"
## [15] "L" "1339" "160" "1,499" "3,370" "322" "3,692"
##
## [[13]]
## [1] "Bozrah" "NewLondon" "2" "359" "13"
## [6] "372" "600" "21" "621" "39"
## [11] "2" "41" "G" "IT" "L"
## [16] "795" "31" "826" "1,793" "67"
## [21] "1,860"
##
## [[14]]
## [1] "Branford" "NewHaven" "3" "3644" "124" "3,768"
## [7] "6784" "236" "7,020" "181" "4" "185"
## [13] "G" "IT" "L" "WF" "9514" "322"
## [19] "9,836" "20,123" "686" "20,809"
##
## [[15]]
## [1] "Bridgeport" "Fairfield" "4" "4128" "820"
## [6] "4,948" "45573" "7345" "52,918" "504"
## [11] "27" "531" "G" "IT" "L"
## [16] "WF" "19030" "4446" "23,476" "69,235"
## [21] "12,638" "81,873"
##
## [[16]]
## [1] "Bridgewater" "Litchfield" "5" "421" "0"
## [6] "421" "412" "540" "952" "25"
## [11] "0" "25" "G" "IT" "L"
## [16] "466" "0" "466" "1,324" "540"
## [21] "1,864"
##
## [[17]]
## [1] "Bristol" "Hartford" "1" "6911" "206" "7,117"
## [7] "12739" "96" "12,835" "605" "22" "627"
## [13] "G" "IT" "L" "WF" "14025" "779"
## [19] "14,804" "34,280" "1,103" "35,383"
##
## [[18]]
## [1] "Brookfield" "Fairfield" "5" "3796" "136"
## [6] "3,932" "2678" "21" "2,699" "240"
## [11] "10" "250" "ABP" "G" "IT"
## [16] "L" "UIT" "4918" "208" "5,126"
## [21] "11,632" "375" "12,007"
##
## [[19]]
## [1] "Brooklyn" "Windham" "2" "1262" "27" "1,289"
## [7] "1436" "66" "1,502" "95" "2" "97"
## [13] "G" "IT" "L" "WF" "2670" "58"
## [19] "2,728" "5,463" "153" "5,616"
##
## [[20]]
## [1] "Burlington" "Hartford" "5" "1874" "41"
## [6] "1,915" "1668" "21" "1,689" "79"
## [11] "5" "84" "G" "IT" "L"
## [16] "2717" "158" "2,875" "6,338" "225"
## [21] "6,563"
##
## [[21]]
## [1] "Canaan" "Litchfield" "5" "165" "16"
## [6] "181" "269" "0" "269" "4"
## [11] "0" "4" "G" "IT" "267"
## [16] "45" "312" "705" "61" "766"
##
## [[22]]
## [1] "Canterbury" "Windham" "2" "1077" "18"
## [6] "1,095" "786" "13" "799" "68"
## [11] "1" "69" "CF" "G" "IT"
## [16] "L" "WF" "1599" "69" "1,668"
## [21] "3,530" "101" "3,631"
##
## [[23]]
## [1] "Canton" "Hartford" "5" "2260" "88" "2,348"
## [7] "2321" "96" "2,417" "87" "7" "94"
## [13] "G" "IT" "L" "2865" "151" "3,016"
## [19] "7,533" "342" "7,875"
To-do:
# Need to remove (or collapse into one field) the variable number of party-code letters (e.g. "G IT L WF") that follow the town and county fields, so that the nested vectors all have the same length
#cp_data %>% map_df(`[[`,)
#map(unlist, cp_data[2:23])