Regular expressions for pattern matching

12393 views

# Basic matching
email = "user@example.com"
# =~ returns the index of the first match, or nil when nothing matches.
email =~ /@/  # => 4 (position of match)
# match? only answers true or false.
email.match?(/@/)  # => true

# Capture groups
# Each parenthesized group is read back by its 1-based position.
# The groups are greedy, so the last one takes only what follows the final dot.
match = email.match(/(.+)@(.+)\.(.+)/)
match[1]  # => "user"
match[2]  # => "example"
match[3]  # => "com"

# Named captures
# (?<name>...) labels a group so it can be read back by symbol.
match = email.match(/(?<user>.+)@(?<domain>.+)\.(?<tld>.+)/)
match[:user]    # => "user"
match[:domain]  # => "example"
match[:tld]     # => "com"

# Scan - find all matches
# Without groups, scan returns the matched substrings themselves.
text = "My phone is 555-1234 and backup is 555-5678"
phones = text.scan(/\d{3}-\d{4}/)
# => ["555-1234", "555-5678"]

# Scan with groups
# With groups, scan returns one array of captures per match.
text = "Alice: 25, Bob: 30, Charlie: 35"
text.scan(/(\w+): (\d+)/)
# => [["Alice", "25"], ["Bob", "30"], ["Charlie", "35"]]

# gsub - global substitution
# gsub returns a new string; `text` itself is not modified.
text = "Hello World"
text.gsub(/[aeiou]/, '*')  # => "H*ll* W*rld"
# Block form: every match is replaced by the block's return value.
text.gsub(/\w+/) { |word| word.capitalize }  # => "Hello World"

# gsub with named captures
# The replacement string refers to the named groups with \k<name>. It is
# single-quoted so the backslashes reach gsub unchanged, and no global match
# variable or unused block parameter is needed.
date = "2026-02-02"
formatted_date = date.gsub(/(?<year>\d{4})-(?<month>\d{2})-(?<day>\d{2})/, '\k<month>/\k<day>/\k<year>')
# => "02/02/2026"

# sub - single substitution
# Only the first match is replaced. `text` is still "Hello World" here because
# the gsub calls above returned new strings.
text.sub(/World/, 'Ruby')  # => "Hello Ruby"

# Split with regex
# The character class splits on any one of the three delimiters.
"one,two;three:four".split(/[,;:]/)
# => ["one", "two", "three", "four"]

# Email validation
# Whole-string email pattern: local part, "@", one or more dot-separated
# domain labels and an alphabetic TLD, matched case-insensitively.
EMAIL_REGEX = /\A[\w+\-.]+@[a-z\d\-]+(\.[a-z\d\-]+)*\.[a-z]+\z/i

# Checks whether the given value looks like an email address.
#
# @param email [String, nil] candidate address
# @return [Boolean] true when the whole string matches EMAIL_REGEX; false
#   otherwise, including when email is nil
def valid_email?(email)
  # Regexp#match? returns false for nil, so a missing value is reported as
  # invalid instead of raising NoMethodError.
  EMAIL_REGEX.match?(email)
end

# URL validation
# Whole-string pattern for an http or https URL that contains no whitespace.
URL_REGEX = %r{\Ahttps?://[^\s/$.?\#].[^\s]*\z}i

# Checks whether the given value looks like an http(s) URL.
#
# @param url [String, nil] candidate URL
# @return [Boolean] true when the whole string matches URL_REGEX; false
#   otherwise, including when url is nil
def valid_url?(url)
  # Regexp#match? returns false for nil, so a missing value is reported as
  # invalid instead of raising NoMethodError.
  URL_REGEX.match?(url)
end

# Phone number extraction
# Ten digits on word boundaries, grouped 3-3-4, where each group may be joined
# to the next by an optional "-" or ".".
PHONE_REGEX = /\b\d{3}[-.]?\d{3}[-.]?\d{4}\b/

# Collects every substring of the text that matches PHONE_REGEX.
#
# @param text [String]
# @return [Array<String>] the matches in order of appearance
def extract_phones(text)
  phone_numbers = text.scan(PHONE_REGEX)
  phone_numbers
end

# Extract hashtags
# Returns each "#" that is followed by one or more word characters.
#
# @param text [String]
# @return [Array<String>] hashtags, leading "#" included, in order of appearance
def extract_hashtags(text)
  hashtag_pattern = /#\w+/
  text.scan(hashtag_pattern)
end

# Extract mentions
# Returns every @handle found in the text.
#
# @param text [String]
# @return [Array<String>] mentions, leading "@" included, in order of appearance
def extract_mentions(text)
  # (?<!\w) rejects an "@" glued to a preceding word character, so the domain
  # part of an email address such as user@example.com is not reported as a
  # mention.
  text.scan(/(?<!\w)@\w+/)
end

# Remove HTML tags
# Deletes every "<...>" run: a "<", one or more characters other than ">",
# then the closing ">". Everything outside such runs is kept as is.
#
# @param text [String]
# @return [String] a new string with the matched runs removed
def strip_html(text)
  text.gsub(/<[^>]+>/) { '' }
end

# Camel to snake case
# Converts CamelCase or camelCase to snake_case, keeping acronym runs together
# ("HTMLParser" becomes "html_parser" rather than "h_t_m_l_parser").
#
# @param str [String]
# @return [String] the lower-cased, underscore-separated form
def camel_to_snake(str)
  # Pass 1 splits an acronym from the capitalized word that follows it
  # (HTMLParser -> HTML_Parser). Pass 2 splits a lower-case letter or digit
  # from the capital that follows it (camelCase -> camel_Case). No leading
  # underscore is ever produced, so nothing has to be trimmed afterwards.
  str.gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2')
     .gsub(/([a-z\d])([A-Z])/, '\1_\2')
     .downcase
end

# Snake to camel case
# Splits on underscores, capitalizes every piece and glues the pieces back
# together, so the first piece is capitalized as well.
#
# @param str [String]
# @return [String]
def snake_to_camel(str)
  words = str.split('_')
  words.map { |word| word.capitalize }.join
end

# Credit card number shape: four groups of four digits on word boundaries,
# each group optionally separated by one whitespace character or hyphen.
# NOTE: this checks the format only; it does not compute a Luhn checksum.
CREDIT_CARD_REGEX = /\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b/

# Extract version numbers
# Finds the first "major.minor.patch" triple, optionally prefixed by "v".
#
# @param text [String]
# @return [Array<String>, nil] the three numeric parts as strings, or nil when
#   the text contains no such triple
def extract_version(text)
  version_match = text.match(/v?(\d+)\.(\d+)\.(\d+)/)
  return nil unless version_match

  version_match.captures
end

# Parse CSV-like data
# Walks the line collecting either the inside of a double-quoted field or a
# run of non-comma characters. An empty unquoted field yields nothing because
# the bare alternative needs at least one character.
#
# @param line [String]
# @return [Array<String>] the collected field values in order
def parse_csv_line(line)
  line.scan(/"([^"]*)"|([^,]+)/).map do |quoted, bare|
    # Exactly one of the two groups takes part in each match.
    quoted || bare
  end
end

# Redact sensitive data
# Masks every ddd-dd-dddd sequence that sits on word boundaries.
#
# @param text [String]
# @return [String] a copy of the text with each match replaced by "XXX-XX-XXXX"
def redact_ssn(text)
  ssn_pattern = /\b\d{3}-\d{2}-\d{4}\b/
  text.gsub(ssn_pattern) { 'XXX-XX-XXXX' }
end

# Masks every card-shaped number, keeping only its last four digits.
#
# @param text [String]
# @return [String] a copy of the text with each match of CREDIT_CARD_REGEX
#   replaced by "XXXX-XXXX-XXXX-" followed by the match's final four digits
def redact_credit_card(text)
  text.gsub(CREDIT_CARD_REGEX) do |card_number|
    # Core String has no #last (that is an ActiveSupport extension), so slice
    # from the end instead. The pattern ends in \d{4}, so these are digits.
    "XXXX-XXXX-XXXX-#{card_number[-4, 4]}"
  end
end

# Lookahead - match if followed by
# Find 'foo' only if followed by 'bar'
# The lookahead is zero-width: "bar" must be there but is not part of the match.
text = "foobar foobaz"
text.scan(/foo(?=bar)/)  # => ["foo"] (only the one in "foobar")

# Negative lookahead
# Find 'foo' only if NOT followed by 'bar'
text.scan(/foo(?!bar)/)  # => ["foo"] (from "foobaz")

# Lookbehind - match if preceded by
# Find digits preceded by '$'
# The "$" is escaped because an unescaped $ is an end-of-line anchor.
text = "Price: $50, Quantity: 50"
text.scan(/(?<=\$)\d+/)  # => ["50"] (only the price)

# Backreferences - reference earlier captures
# Find repeated words
text = "the the quick brown brown fox"
# When the pattern has groups, scan returns one array of captures per match
# ([["the"], ["brown"]] here), so flatten it into a plain list of words.
repeated_words = text.scan(/\b(\w+)\s+\1\b/).flatten  # => ["the", "brown"]

# Named backreferences
# \k<word> must repeat exactly the text captured by the group named "word".
/(?<word>\w+)\s+\k<word>/

# Conditional regex
# NOTE: the pattern below is plain alternation, not a true conditional. After
# the leading yes/no word, either "yes ok" or "no cancel" must follow, and the
# second word is not tied to the first. Ruby's actual conditional syntax is
# (?(cond)yes|no).
/^(yes|no)\s+(?:(yes)\s+ok|(no)\s+cancel)$/

# Case-insensitive flag
/hello/i.match("HELLO")  # => #<MatchData "HELLO">

# Multiline mode
# In Ruby, ^ and $ always anchor at line boundaries, with or without flags.
# The m flag changes only the dot: it lets . match a newline as well.
text = "line1\nline2\nline3"
line_starts = text.scan(/^line/)             # => ["line", "line", "line"]
dot_default = text.match?(/line1.line2/)     # => false (. stops at "\n")
dot_multiline = text.match?(/line1.line2/m)  # => true  (. also matches "\n")

# Extended mode - allows whitespace and comments
# Accepts the same addresses as EMAIL_REGEX, including extra dot-separated
# domain labels (user@mail.example.com). The extra-label group is
# non-capturing, so the pattern still exposes no capture groups.
email_regex = /
  \A
  [\w+\-.]+            # Local part
  @                    # At sign
  [a-z\d\-]+           # First domain label
  (?:\.[a-z\d\-]+)*    # Further dot-separated labels
  \.                   # Dot
  [a-z]+               # TLD
  \z
/ix  # i = case insensitive, x = extended

# Global match data
# A successful =~ fills in the special match variables read below.
if "hello@world.com" =~ /(\w+)@(\w+)\.(\w+)/
  $1  # => "hello"
  $2  # => "world"
  $3  # => "com"
  $&  # => "hello@world.com" (entire match)
  $`  # => "" (before match)
  $'  # => "" (after match)
end

# MatchData object
# The same details are available, without globals, on the object match returns.
match = "hello@world.com".match(/(\w+)@(\w+)\.(\w+)/)
match.captures     # => ["hello", "world", "com"]
match.pre_match    # => "" (text before match)
match.post_match   # => "" (text after match)
match.string       # => "hello@world.com" (original string)

Ruby's regex engine provides powerful text processing. I use =~ for matching, match for captures. Character classes \d, \w, \s match digits, word characters, and whitespace. Quantifiers *, +, ?, {n,m} control repetition. Anchors ^ and $ match the start/end of a line, while \A and \z match the start/end of the whole string. Groups () capture subpatterns; (?:) for non-capturing groups. Named captures (?<name>) improve readability. Lookaheads (?=) and lookbehinds (?<=) assert without consuming. scan finds all matches; gsub replaces patterns. Regex literals // and %r{} allow different delimiters. Understanding regex enables text validation, parsing, and transformation. I balance regex power with readability—complex patterns need comments or extraction into methods.

Sarah Mitchell

More from Sarah Mitchell

Regular expressions for pattern matching

0 Comments

More from Sarah Mitchell